DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split for this document:
There are a few nuggets from within these beginning modules, including:
Below is some sample code showing examples for the generic statements:
library(ggplot2)
library(ggthemes)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Factors
# An unordered factor supports equality tests but not order comparisons;
# pass ordered=TRUE to get a factor whose levels have a defined order.
xRaw = c("High", "High", "Low", "Low", "Medium", "Very High", "Low")
xFactorNon = factor(xRaw, levels=c("Low", "Medium", "High", "Very High"))
xFactorNon
## [1] High High Low Low Medium Very High Low
## Levels: Low Medium High Very High
# '>' on an unordered factor warns and returns NA
xFactorNon[xFactorNon == "High"] > xFactorNon[xFactorNon == "Low"][1]
## Warning in Ops.factor(xFactorNon[xFactorNon == "High"],
## xFactorNon[xFactorNon == : '>' not meaningful for factors
## [1] NA NA
xFactorOrder = factor(xRaw, ordered=TRUE, levels=c("Low", "Medium", "High", "Very High"))
xFactorOrder
## [1] High High Low Low Medium Very High Low
## Levels: Low < Medium < High < Very High
# With an ordered factor the comparison uses the level ordering
xFactorOrder[xFactorOrder == "High"] > xFactorOrder[xFactorOrder == "Low"][1]
## [1] TRUE TRUE
# Subsets
data(mtcars)
# subset() keeps the rows that satisfy the condition
subset(mtcars, mpg>=25)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# subset() is equivalent to logical row indexing with [
identical(subset(mtcars, mpg>=25), mtcars[mtcars$mpg>=25, ])
## [1] TRUE
# select= restricts which columns come back
subset(mtcars, mpg>25, select=c("mpg", "cyl", "disp"))
## mpg cyl disp
## Fiat 128 32.4 4 78.7
## Honda Civic 30.4 4 75.7
## Toyota Corolla 33.9 4 71.1
## Fiat X1-9 27.3 4 79.0
## Porsche 914-2 26.0 4 120.3
## Lotus Europa 30.4 4 95.1
# & and && (same as | and ||)
# & and | are elementwise; && and || inspect only the FIRST elements and
# short-circuit. NOTE(review): since R 4.3, && and || signal an error when
# given vectors of length > 1, so the last two examples only reproduce this
# output on older versions of R.
compA <- c(2, 3, 4, 1, 2, 3)
compB <- c(1, 2, 3, 4, 5, 6)
(compA > compB) & (compA + compB < 6)
## [1] TRUE TRUE FALSE FALSE FALSE FALSE
(compA > compB) | (compA + compB < 6)
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
(compA > compB) && (compA + compB < 6)
## [1] TRUE
(compA > compB) || (compA + compB < 6)
## [1] TRUE
# Loops and cat()
# Generic for-loop skeleton: break exits the loop entirely, next skips
# straight to the following iteration.
# for (a in b) {
# do stuff
# if (exitCond) { break }
# if (nextCond) { next }
# do some more stuff
# }
# Iterates over the elementwise products 2, 6, 12, 4, 10, 18
for (myVal in compA*compB) {
print(paste0("myVal is: ", myVal))
if ((myVal %% 3) == 0) { cat("Divisible by 3, not happy about that\n\n"); next }
print("That is not divisible by 3")
if ((myVal %% 5) == 0) { cat("Exiting due to divisible by 5 but not divisible by 3\n\n"); break }
cat("Onwards and upwards\n\n")
}
## [1] "myVal is: 2"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 6"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 12"
## Divisible by 3, not happy about that
##
## [1] "myVal is: 4"
## [1] "That is not divisible by 3"
## Onwards and upwards
##
## [1] "myVal is: 10"
## [1] "That is not divisible by 3"
## Exiting due to divisible by 5 but not divisible by 3
# args() and search()
# args() shows a function's formal argument list
args(plot.default)
## function (x, y = NULL, type = "p", xlim = NULL, ylim = NULL,
## log = "", main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
## ann = par("ann"), axes = TRUE, frame.plot = axes, panel.first = NULL,
## panel.last = NULL, asp = NA, ...)
## NULL
# search() lists the attached packages/environments, in lookup order
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggthemes"
## [4] "package:ggplot2" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
# unique()
compA
## [1] 2 3 4 1 2 3
# unique() drops duplicates, keeping first-appearance order
unique(compA)
## [1] 2 3 4 1
# unlist()
# unlist() flattens a list back into an atomic vector
listA <- as.list(compA)
unlist(listA)
## [1] 2 3 4 1 2 3
identical(compA, unlist(listA))
## [1] TRUE
# sort()
sort(mtcars$mpg)
## [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1
## [15] 18.7 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3
## [29] 30.4 30.4 32.4 33.9
# decreasing=TRUE reverses the sort order
sort(mtcars$mpg, decreasing=TRUE)
## [1] 33.9 32.4 30.4 30.4 27.3 26.0 24.4 22.8 22.8 21.5 21.4 21.4 21.0 21.0
## [15] 19.7 19.2 19.2 18.7 18.1 17.8 17.3 16.4 15.8 15.5 15.2 15.2 15.0 14.7
## [29] 14.3 13.3 10.4 10.4
# rep()
# times= tiles the whole vector; each= repeats each element in place;
# when combined, each= is applied first and times= tiles the result.
rep(1:6, times=2) # 1:6 followed by 1:6
## [1] 1 2 3 4 5 6 1 2 3 4 5 6
rep(1:6, each=2) # 1 1 2 2 3 3 4 4 5 5 6 6
## [1] 1 1 2 2 3 3 4 4 5 5 6 6
rep(1:6, times=2, each=3) # 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 repeated twice (each comes first)
## [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6
## [36] 6
rep(1:6, times=6:1) # 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
## [1] 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 6
# append()
myWords <- c("The", "cat", "in", "the", "hat")
# By default, append() adds the new values at the end
paste(append(myWords, c("is", "fun", "to", "read")), collapse=" ")
## [1] "The cat in the hat is fun to read"
# The third argument (after=) inserts the new values after that position
paste(append(myWords, "funny", 4), collapse=" ")
## [1] "The cat in the funny hat"
# gsub() with capture groups and backreferences ("\\1", "\\2", "\\3")
sampMsg <- "This is from myname@subdomain.mydomain.com again"
# Group 1: everything up to and including the "@"
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\1", sampMsg)
## [1] "This is from myname@"
# Group 2: the domain portion of the address
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\2", sampMsg)
## [1] "subdomain.mydomain.com"
# Group 3: everything after the domain
gsub("(^.*\\w*[a-zA-Z0-9]+@)([a-zA-Z0-9]+\\.[a-zA-Z0-9.]+)(.*$)", "\\3", sampMsg)
## [1] " again"
# rev()
compA
## [1] 2 3 4 1 2 3
# rev() reverses element order
rev(compA)
## [1] 3 2 1 4 3 2
Below is some sample code showing examples for the apply statements:
# lapply
# lapply() applies FUN to each element of X and always returns a list
args(lapply)
## function (X, FUN, ...)
## NULL
lapply(1:5, FUN=sqrt)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1.414214
##
## [[3]]
## [1] 1.732051
##
## [[4]]
## [1] 2
##
## [[5]]
## [1] 2.236068
# Extra arguments after FUN (here y=3) are passed through to FUN,
# overriding the function's own default y=2
lapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## x y pow
## 4 3 64
##
## [[5]]
## x y pow
## 5 3 125
# A list happily holds per-element results of different lengths
lapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# sapply (defaults to returning a named vector/array if possible; is lapply otherwise)
args(sapply)
## function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
## NULL
args(simplify2array)
## function (x, higher = TRUE)
## NULL
# Equal-length results simplify to a vector or matrix
sapply(1:5, FUN=sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
sapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
# Ragged results cannot be simplified, so sapply silently falls back to a list
sapply(1:5, FUN=function(x, y=2) { if (x <= 3) {c(x=x, y=y, pow=x^y) } else { c(pow=x^y) } }, y=3)
## [[1]]
## x y pow
## 1 3 1
##
## [[2]]
## x y pow
## 2 3 8
##
## [[3]]
## x y pow
## 3 3 27
##
## [[4]]
## pow
## 64
##
## [[5]]
## pow
## 125
# vapply (tells sapply exactly what should be returned; errors out otherwise)
args(vapply)
## function (X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
## NULL
# FUN.VALUE declares the required type and length of each per-element result
vapply(1:5, FUN=sqrt, FUN.VALUE=numeric(1))
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068
vapply(1:5, FUN=function(x, y=2) { c(x=x, y=y, pow=x^y) }, FUN.VALUE=numeric(3), y=3)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 3 3 3 3
## pow 1 8 27 64 125
Below is some sample code for handling dates and times in R:
Sys.Date()
## [1] "2017-11-07"
Sys.time()
## [1] "2017-11-07 08:48:18 CST"
args(strptime)
## function (x, format, tz = "")
## NULL
rightNow <- as.POSIXct(Sys.time())
# NOTE(review): %M is MINUTES, not month (%m) -- the "48" in the output
# below is the minute value, which illustrates the distinction
format(rightNow, "%Y**%M-%d %H hours and %M minutes", usetz=TRUE)
## [1] "2017**48-07 08 hours and 48 minutes CST"
# %X parses a locale time ("%H:%M:%S" on input)
lastChristmasNoon <- as.POSIXct("2015-12-25 12:00:00", format="%Y-%m-%d %X")
rightNow - lastChristmasNoon
## Time difference of 682.8669 days
# %y/%b/%d matches the two-digit-year / month-name / day form "16/SEP/3"
nextUMHomeGame <- as.POSIXct("16/SEP/3 12:00:00", format="%y/%b/%d %H:%M:%S", tz="America/Detroit")
nextUMHomeGame - rightNow
## Time difference of -429.9502 days
# Time zones available in R
# OlsonNames() lists the tz database names usable as tz= arguments
OlsonNames()
## [1] "Africa/Abidjan" "Africa/Accra"
## [3] "Africa/Addis_Ababa" "Africa/Algiers"
## [5] "Africa/Asmara" "Africa/Asmera"
## [7] "Africa/Bamako" "Africa/Bangui"
## [9] "Africa/Banjul" "Africa/Bissau"
## [11] "Africa/Blantyre" "Africa/Brazzaville"
## [13] "Africa/Bujumbura" "Africa/Cairo"
## [15] "Africa/Casablanca" "Africa/Ceuta"
## [17] "Africa/Conakry" "Africa/Dakar"
## [19] "Africa/Dar_es_Salaam" "Africa/Djibouti"
## [21] "Africa/Douala" "Africa/El_Aaiun"
## [23] "Africa/Freetown" "Africa/Gaborone"
## [25] "Africa/Harare" "Africa/Johannesburg"
## [27] "Africa/Juba" "Africa/Kampala"
## [29] "Africa/Khartoum" "Africa/Kigali"
## [31] "Africa/Kinshasa" "Africa/Lagos"
## [33] "Africa/Libreville" "Africa/Lome"
## [35] "Africa/Luanda" "Africa/Lubumbashi"
## [37] "Africa/Lusaka" "Africa/Malabo"
## [39] "Africa/Maputo" "Africa/Maseru"
## [41] "Africa/Mbabane" "Africa/Mogadishu"
## [43] "Africa/Monrovia" "Africa/Nairobi"
## [45] "Africa/Ndjamena" "Africa/Niamey"
## [47] "Africa/Nouakchott" "Africa/Ouagadougou"
## [49] "Africa/Porto-Novo" "Africa/Sao_Tome"
## [51] "Africa/Timbuktu" "Africa/Tripoli"
## [53] "Africa/Tunis" "Africa/Windhoek"
## [55] "America/Adak" "America/Anchorage"
## [57] "America/Anguilla" "America/Antigua"
## [59] "America/Araguaina" "America/Argentina/Buenos_Aires"
## [61] "America/Argentina/Catamarca" "America/Argentina/ComodRivadavia"
## [63] "America/Argentina/Cordoba" "America/Argentina/Jujuy"
## [65] "America/Argentina/La_Rioja" "America/Argentina/Mendoza"
## [67] "America/Argentina/Rio_Gallegos" "America/Argentina/Salta"
## [69] "America/Argentina/San_Juan" "America/Argentina/San_Luis"
## [71] "America/Argentina/Tucuman" "America/Argentina/Ushuaia"
## [73] "America/Aruba" "America/Asuncion"
## [75] "America/Atikokan" "America/Atka"
## [77] "America/Bahia" "America/Bahia_Banderas"
## [79] "America/Barbados" "America/Belem"
## [81] "America/Belize" "America/Blanc-Sablon"
## [83] "America/Boa_Vista" "America/Bogota"
## [85] "America/Boise" "America/Buenos_Aires"
## [87] "America/Cambridge_Bay" "America/Campo_Grande"
## [89] "America/Cancun" "America/Caracas"
## [91] "America/Catamarca" "America/Cayenne"
## [93] "America/Cayman" "America/Chicago"
## [95] "America/Chihuahua" "America/Coral_Harbour"
## [97] "America/Cordoba" "America/Costa_Rica"
## [99] "America/Creston" "America/Cuiaba"
## [101] "America/Curacao" "America/Danmarkshavn"
## [103] "America/Dawson" "America/Dawson_Creek"
## [105] "America/Denver" "America/Detroit"
## [107] "America/Dominica" "America/Edmonton"
## [109] "America/Eirunepe" "America/El_Salvador"
## [111] "America/Ensenada" "America/Fort_Nelson"
## [113] "America/Fort_Wayne" "America/Fortaleza"
## [115] "America/Glace_Bay" "America/Godthab"
## [117] "America/Goose_Bay" "America/Grand_Turk"
## [119] "America/Grenada" "America/Guadeloupe"
## [121] "America/Guatemala" "America/Guayaquil"
## [123] "America/Guyana" "America/Halifax"
## [125] "America/Havana" "America/Hermosillo"
## [127] "America/Indiana/Indianapolis" "America/Indiana/Knox"
## [129] "America/Indiana/Marengo" "America/Indiana/Petersburg"
## [131] "America/Indiana/Tell_City" "America/Indiana/Vevay"
## [133] "America/Indiana/Vincennes" "America/Indiana/Winamac"
## [135] "America/Indianapolis" "America/Inuvik"
## [137] "America/Iqaluit" "America/Jamaica"
## [139] "America/Jujuy" "America/Juneau"
## [141] "America/Kentucky/Louisville" "America/Kentucky/Monticello"
## [143] "America/Knox_IN" "America/Kralendijk"
## [145] "America/La_Paz" "America/Lima"
## [147] "America/Los_Angeles" "America/Louisville"
## [149] "America/Lower_Princes" "America/Maceio"
## [151] "America/Managua" "America/Manaus"
## [153] "America/Marigot" "America/Martinique"
## [155] "America/Matamoros" "America/Mazatlan"
## [157] "America/Mendoza" "America/Menominee"
## [159] "America/Merida" "America/Metlakatla"
## [161] "America/Mexico_City" "America/Miquelon"
## [163] "America/Moncton" "America/Monterrey"
## [165] "America/Montevideo" "America/Montreal"
## [167] "America/Montserrat" "America/Nassau"
## [169] "America/New_York" "America/Nipigon"
## [171] "America/Nome" "America/Noronha"
## [173] "America/North_Dakota/Beulah" "America/North_Dakota/Center"
## [175] "America/North_Dakota/New_Salem" "America/Ojinaga"
## [177] "America/Panama" "America/Pangnirtung"
## [179] "America/Paramaribo" "America/Phoenix"
## [181] "America/Port-au-Prince" "America/Port_of_Spain"
## [183] "America/Porto_Acre" "America/Porto_Velho"
## [185] "America/Puerto_Rico" "America/Rainy_River"
## [187] "America/Rankin_Inlet" "America/Recife"
## [189] "America/Regina" "America/Resolute"
## [191] "America/Rio_Branco" "America/Rosario"
## [193] "America/Santa_Isabel" "America/Santarem"
## [195] "America/Santiago" "America/Santo_Domingo"
## [197] "America/Sao_Paulo" "America/Scoresbysund"
## [199] "America/Shiprock" "America/Sitka"
## [201] "America/St_Barthelemy" "America/St_Johns"
## [203] "America/St_Kitts" "America/St_Lucia"
## [205] "America/St_Thomas" "America/St_Vincent"
## [207] "America/Swift_Current" "America/Tegucigalpa"
## [209] "America/Thule" "America/Thunder_Bay"
## [211] "America/Tijuana" "America/Toronto"
## [213] "America/Tortola" "America/Vancouver"
## [215] "America/Virgin" "America/Whitehorse"
## [217] "America/Winnipeg" "America/Yakutat"
## [219] "America/Yellowknife" "Antarctica/Casey"
## [221] "Antarctica/Davis" "Antarctica/DumontDUrville"
## [223] "Antarctica/Macquarie" "Antarctica/Mawson"
## [225] "Antarctica/McMurdo" "Antarctica/Palmer"
## [227] "Antarctica/Rothera" "Antarctica/South_Pole"
## [229] "Antarctica/Syowa" "Antarctica/Troll"
## [231] "Antarctica/Vostok" "Arctic/Longyearbyen"
## [233] "Asia/Aden" "Asia/Almaty"
## [235] "Asia/Amman" "Asia/Anadyr"
## [237] "Asia/Aqtau" "Asia/Aqtobe"
## [239] "Asia/Ashgabat" "Asia/Ashkhabad"
## [241] "Asia/Atyrau" "Asia/Baghdad"
## [243] "Asia/Bahrain" "Asia/Baku"
## [245] "Asia/Bangkok" "Asia/Barnaul"
## [247] "Asia/Beirut" "Asia/Bishkek"
## [249] "Asia/Brunei" "Asia/Calcutta"
## [251] "Asia/Chita" "Asia/Choibalsan"
## [253] "Asia/Chongqing" "Asia/Chungking"
## [255] "Asia/Colombo" "Asia/Dacca"
## [257] "Asia/Damascus" "Asia/Dhaka"
## [259] "Asia/Dili" "Asia/Dubai"
## [261] "Asia/Dushanbe" "Asia/Famagusta"
## [263] "Asia/Gaza" "Asia/Harbin"
## [265] "Asia/Hebron" "Asia/Ho_Chi_Minh"
## [267] "Asia/Hong_Kong" "Asia/Hovd"
## [269] "Asia/Irkutsk" "Asia/Istanbul"
## [271] "Asia/Jakarta" "Asia/Jayapura"
## [273] "Asia/Jerusalem" "Asia/Kabul"
## [275] "Asia/Kamchatka" "Asia/Karachi"
## [277] "Asia/Kashgar" "Asia/Kathmandu"
## [279] "Asia/Katmandu" "Asia/Khandyga"
## [281] "Asia/Kolkata" "Asia/Krasnoyarsk"
## [283] "Asia/Kuala_Lumpur" "Asia/Kuching"
## [285] "Asia/Kuwait" "Asia/Macao"
## [287] "Asia/Macau" "Asia/Magadan"
## [289] "Asia/Makassar" "Asia/Manila"
## [291] "Asia/Muscat" "Asia/Nicosia"
## [293] "Asia/Novokuznetsk" "Asia/Novosibirsk"
## [295] "Asia/Omsk" "Asia/Oral"
## [297] "Asia/Phnom_Penh" "Asia/Pontianak"
## [299] "Asia/Pyongyang" "Asia/Qatar"
## [301] "Asia/Qyzylorda" "Asia/Rangoon"
## [303] "Asia/Riyadh" "Asia/Saigon"
## [305] "Asia/Sakhalin" "Asia/Samarkand"
## [307] "Asia/Seoul" "Asia/Shanghai"
## [309] "Asia/Singapore" "Asia/Srednekolymsk"
## [311] "Asia/Taipei" "Asia/Tashkent"
## [313] "Asia/Tbilisi" "Asia/Tehran"
## [315] "Asia/Tel_Aviv" "Asia/Thimbu"
## [317] "Asia/Thimphu" "Asia/Tokyo"
## [319] "Asia/Tomsk" "Asia/Ujung_Pandang"
## [321] "Asia/Ulaanbaatar" "Asia/Ulan_Bator"
## [323] "Asia/Urumqi" "Asia/Ust-Nera"
## [325] "Asia/Vientiane" "Asia/Vladivostok"
## [327] "Asia/Yakutsk" "Asia/Yangon"
## [329] "Asia/Yekaterinburg" "Asia/Yerevan"
## [331] "Atlantic/Azores" "Atlantic/Bermuda"
## [333] "Atlantic/Canary" "Atlantic/Cape_Verde"
## [335] "Atlantic/Faeroe" "Atlantic/Faroe"
## [337] "Atlantic/Jan_Mayen" "Atlantic/Madeira"
## [339] "Atlantic/Reykjavik" "Atlantic/South_Georgia"
## [341] "Atlantic/St_Helena" "Atlantic/Stanley"
## [343] "Australia/ACT" "Australia/Adelaide"
## [345] "Australia/Brisbane" "Australia/Broken_Hill"
## [347] "Australia/Canberra" "Australia/Currie"
## [349] "Australia/Darwin" "Australia/Eucla"
## [351] "Australia/Hobart" "Australia/LHI"
## [353] "Australia/Lindeman" "Australia/Lord_Howe"
## [355] "Australia/Melbourne" "Australia/North"
## [357] "Australia/NSW" "Australia/Perth"
## [359] "Australia/Queensland" "Australia/South"
## [361] "Australia/Sydney" "Australia/Tasmania"
## [363] "Australia/Victoria" "Australia/West"
## [365] "Australia/Yancowinna" "Brazil/Acre"
## [367] "Brazil/DeNoronha" "Brazil/East"
## [369] "Brazil/West" "Canada/Atlantic"
## [371] "Canada/Central" "Canada/East-Saskatchewan"
## [373] "Canada/Eastern" "Canada/Mountain"
## [375] "Canada/Newfoundland" "Canada/Pacific"
## [377] "Canada/Saskatchewan" "Canada/Yukon"
## [379] "CET" "Chile/Continental"
## [381] "Chile/EasterIsland" "CST6CDT"
## [383] "Cuba" "EET"
## [385] "Egypt" "Eire"
## [387] "EST" "EST5EDT"
## [389] "Etc/GMT" "Etc/GMT-0"
## [391] "Etc/GMT-1" "Etc/GMT-10"
## [393] "Etc/GMT-11" "Etc/GMT-12"
## [395] "Etc/GMT-13" "Etc/GMT-14"
## [397] "Etc/GMT-2" "Etc/GMT-3"
## [399] "Etc/GMT-4" "Etc/GMT-5"
## [401] "Etc/GMT-6" "Etc/GMT-7"
## [403] "Etc/GMT-8" "Etc/GMT-9"
## [405] "Etc/GMT+0" "Etc/GMT+1"
## [407] "Etc/GMT+10" "Etc/GMT+11"
## [409] "Etc/GMT+12" "Etc/GMT+2"
## [411] "Etc/GMT+3" "Etc/GMT+4"
## [413] "Etc/GMT+5" "Etc/GMT+6"
## [415] "Etc/GMT+7" "Etc/GMT+8"
## [417] "Etc/GMT+9" "Etc/GMT0"
## [419] "Etc/Greenwich" "Etc/UCT"
## [421] "Etc/Universal" "Etc/UTC"
## [423] "Etc/Zulu" "Europe/Amsterdam"
## [425] "Europe/Andorra" "Europe/Astrakhan"
## [427] "Europe/Athens" "Europe/Belfast"
## [429] "Europe/Belgrade" "Europe/Berlin"
## [431] "Europe/Bratislava" "Europe/Brussels"
## [433] "Europe/Bucharest" "Europe/Budapest"
## [435] "Europe/Busingen" "Europe/Chisinau"
## [437] "Europe/Copenhagen" "Europe/Dublin"
## [439] "Europe/Gibraltar" "Europe/Guernsey"
## [441] "Europe/Helsinki" "Europe/Isle_of_Man"
## [443] "Europe/Istanbul" "Europe/Jersey"
## [445] "Europe/Kaliningrad" "Europe/Kiev"
## [447] "Europe/Kirov" "Europe/Lisbon"
## [449] "Europe/Ljubljana" "Europe/London"
## [451] "Europe/Luxembourg" "Europe/Madrid"
## [453] "Europe/Malta" "Europe/Mariehamn"
## [455] "Europe/Minsk" "Europe/Monaco"
## [457] "Europe/Moscow" "Europe/Nicosia"
## [459] "Europe/Oslo" "Europe/Paris"
## [461] "Europe/Podgorica" "Europe/Prague"
## [463] "Europe/Riga" "Europe/Rome"
## [465] "Europe/Samara" "Europe/San_Marino"
## [467] "Europe/Sarajevo" "Europe/Saratov"
## [469] "Europe/Simferopol" "Europe/Skopje"
## [471] "Europe/Sofia" "Europe/Stockholm"
## [473] "Europe/Tallinn" "Europe/Tirane"
## [475] "Europe/Tiraspol" "Europe/Ulyanovsk"
## [477] "Europe/Uzhgorod" "Europe/Vaduz"
## [479] "Europe/Vatican" "Europe/Vienna"
## [481] "Europe/Vilnius" "Europe/Volgograd"
## [483] "Europe/Warsaw" "Europe/Zagreb"
## [485] "Europe/Zaporozhye" "Europe/Zurich"
## [487] "GB" "GB-Eire"
## [489] "GMT" "GMT-0"
## [491] "GMT+0" "GMT0"
## [493] "Greenwich" "Hongkong"
## [495] "HST" "Iceland"
## [497] "Indian/Antananarivo" "Indian/Chagos"
## [499] "Indian/Christmas" "Indian/Cocos"
## [501] "Indian/Comoro" "Indian/Kerguelen"
## [503] "Indian/Mahe" "Indian/Maldives"
## [505] "Indian/Mauritius" "Indian/Mayotte"
## [507] "Indian/Reunion" "Iran"
## [509] "Israel" "Jamaica"
## [511] "Japan" "Kwajalein"
## [513] "Libya" "MET"
## [515] "Mexico/BajaNorte" "Mexico/BajaSur"
## [517] "Mexico/General" "MST"
## [519] "MST7MDT" "Navajo"
## [521] "NZ" "NZ-CHAT"
## [523] "Pacific/Apia" "Pacific/Auckland"
## [525] "Pacific/Bougainville" "Pacific/Chatham"
## [527] "Pacific/Chuuk" "Pacific/Easter"
## [529] "Pacific/Efate" "Pacific/Enderbury"
## [531] "Pacific/Fakaofo" "Pacific/Fiji"
## [533] "Pacific/Funafuti" "Pacific/Galapagos"
## [535] "Pacific/Gambier" "Pacific/Guadalcanal"
## [537] "Pacific/Guam" "Pacific/Honolulu"
## [539] "Pacific/Johnston" "Pacific/Kiritimati"
## [541] "Pacific/Kosrae" "Pacific/Kwajalein"
## [543] "Pacific/Majuro" "Pacific/Marquesas"
## [545] "Pacific/Midway" "Pacific/Nauru"
## [547] "Pacific/Niue" "Pacific/Norfolk"
## [549] "Pacific/Noumea" "Pacific/Pago_Pago"
## [551] "Pacific/Palau" "Pacific/Pitcairn"
## [553] "Pacific/Pohnpei" "Pacific/Ponape"
## [555] "Pacific/Port_Moresby" "Pacific/Rarotonga"
## [557] "Pacific/Saipan" "Pacific/Samoa"
## [559] "Pacific/Tahiti" "Pacific/Tarawa"
## [561] "Pacific/Tongatapu" "Pacific/Truk"
## [563] "Pacific/Wake" "Pacific/Wallis"
## [565] "Pacific/Yap" "Poland"
## [567] "Portugal" "PRC"
## [569] "PST8PDT" "ROC"
## [571] "ROK" "Singapore"
## [573] "Turkey" "UCT"
## [575] "Universal" "US/Alaska"
## [577] "US/Aleutian" "US/Arizona"
## [579] "US/Central" "US/East-Indiana"
## [581] "US/Eastern" "US/Hawaii"
## [583] "US/Indiana-Starke" "US/Michigan"
## [585] "US/Mountain" "US/Pacific"
## [587] "US/Pacific-New" "US/Samoa"
## [589] "UTC" "VERSION"
## [591] "W-SU" "WET"
## [593] "Zulu"
# From ?strptime (excerpted)
#
# ** General formats **
# %c Date and time. Locale-specific on output, "%a %b %e %H:%M:%S %Y" on input.
# %F Equivalent to %Y-%m-%d (the ISO 8601 date format).
# %T Equivalent to %H:%M:%S.
# %D Date format such as %m/%d/%y: the C99 standard says it should be that exact format
# %x Date. Locale-specific on output, "%y/%m/%d" on input.
# %X Time. Locale-specific on output, "%H:%M:%S" on input.
#
# ** Key Components **
# %y Year without century (00-99). On input, values 00 to 68 are prefixed by 20 and 69 to 99 by 19
# %Y Year with century
# %m Month as decimal number (01-12).
# %b Abbreviated month name in the current locale on this platform.
# %B Full month name in the current locale.
# %d Day of the month as decimal number (01-31).
# %e Day of the month as decimal number (1-31), with a leading space for a single-digit number.
# %a Abbreviated weekday name in the current locale on this platform.
# %A Full weekday name in the current locale.
# %H Hours as decimal number (00-23)
# %I Hours as decimal number (01-12)
# %M Minute as decimal number (00-59).
# %S Second as integer (00-61), allowing for up to two leap-seconds (but POSIX-compliant implementations will ignore leap seconds).
#
# ** Additional Options **
# %C Century (00-99): the integer part of the year divided by 100.
#
# %g The last two digits of the week-based year (see %V). (Accepted but ignored on input.)
# %G The week-based year (see %V) as a decimal number. (Accepted but ignored on input.)
#
# %h Equivalent to %b.
#
# %j Day of year as decimal number (001-366).
#
# %n Newline on output, arbitrary whitespace on input.
#
# %p AM/PM indicator in the locale. Used in conjunction with %I and not with %H. An empty string in some locales (and the behaviour is undefined if used for input in such a locale). Some platforms accept %P for output, which uses a lower-case version: others will output P.
#
# %r The 12-hour clock time (using the locale's AM or PM). Only defined in some locales.
#
# %R Equivalent to %H:%M.
#
# %t Tab on output, arbitrary whitespace on input.
#
# %u Weekday as a decimal number (1-7, Monday is 1).
#
# %U Week of the year as decimal number (00-53) using Sunday as the first day 1 of the week (and typically with the first Sunday of the year as day 1 of week 1). The US convention.
#
# %V Week of the year as decimal number (01-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1. (Accepted but ignored on input.)
#
# %w Weekday as decimal number (0-6, Sunday is 0).
#
# %W Week of the year as decimal number (00-53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). The UK convention.
#
# For input, only years 0:9999 are accepted.
#
# %z Signed offset in hours and minutes from UTC, so -0800 is 8 hours behind UTC. Values up to +1400 are accepted as from R 3.1.1: previous versions only accepted up to +1200. (Standard only for output.)
#
# %Z (Output only.) Time zone abbreviation as a character string (empty if not available). This may not be reliable when a time zone has changed abbreviations over the years.
Additionally, code from several practice examples is added:
set.seed(1608221310)
# My own grade plus 199 simulated classmates' grades
me <- 89
other_199 <- round(rnorm(199, mean=75.45, sd=11.03), 0)
mean(other_199)
## [1] 75.17588
sd(other_199)
## [1] 11.37711
# Target means/SDs for four simulated previous years
desMeans <- c(72.275, 76.24, 74.5, 77.695)
desSD <- c(12.31, 11.22, 12.5, 12.53)
prevData <- c(rnorm(200, mean=72.275, sd=12.31),
rnorm(200, mean=76.24, sd=11.22),
rnorm(200, mean=74.5, sd=12.5),
rnorm(200, mean=77.695, sd=12.53)
)
previous_4 <- matrix(data=prevData, ncol=4)
curMeans <- apply(previous_4, 2, FUN=mean)
curSD <- apply(previous_4, 2, FUN=sd)
# Linearly rescale each column so its sample mean/SD hit the desired values
# (desMeans/desSD recycle across rows thanks to the per-row apply)
previous_4 <- t(apply(previous_4, 1, FUN=function(x) { desMeans + (desSD / curSD) * (x - curMeans) } ))
apply(round(previous_4, 0), 2, FUN=mean)
## [1] 72.285 76.245 74.505 77.665
apply(round(previous_4, 0), 2, FUN=sd)
## [1] 12.35097 11.19202 12.49643 12.51744
previous_4 <- round(previous_4, 0)
# Merge me and other_199: my_class
my_class <- c(me, other_199)
# cbind() my_class and previous_4: last_5
last_5 <- cbind(my_class, previous_4)
# Name last_5 appropriately
nms <- paste0("year_", 1:5)
colnames(last_5) <- nms
# Build histogram of my_class
hist(my_class)
# Generate summary of last_5
summary(last_5)
## year_1 year_2 year_3 year_4
## Min. : 46.00 Min. : 43.00 Min. : 38.00 Min. : 42.00
## 1st Qu.: 68.00 1st Qu.: 63.75 1st Qu.: 69.00 1st Qu.: 65.75
## Median : 75.50 Median : 73.00 Median : 76.50 Median : 74.00
## Mean : 75.25 Mean : 72.28 Mean : 76.25 Mean : 74.50
## 3rd Qu.: 83.25 3rd Qu.: 81.00 3rd Qu.: 84.25 3rd Qu.: 82.25
## Max. :108.00 Max. :108.00 Max. :102.00 Max. :113.00
## year_5
## Min. : 38.00
## 1st Qu.: 71.00
## Median : 78.00
## Mean : 77.67
## 3rd Qu.: 86.00
## Max. :117.00
# Build boxplot of last_5
boxplot(last_5)
# How many grades in your class are higher than 75?
# sum() of a logical vector counts the TRUEs
sum(my_class > 75)
## [1] 100
# How many students in your class scored strictly higher than you?
sum(my_class > me)
## [1] 17
# What's the proportion of grades below or equal to 64 in the last 5 years?
# mean() of a logical vector gives the proportion of TRUEs
mean(last_5 <= 64)
## [1] 0.191
# Is your grade greater than 87 and smaller than or equal to 89?
me > 87 & me <= 89
## [1] TRUE
# Which grades in your class are below 60 or above 90?
my_class < 60 | my_class > 90
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [23] TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [34] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [45] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [56] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [78] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [89] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [100] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [133] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [155] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [166] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [199] FALSE FALSE
# What's the proportion of grades in your class that is average?
mean(my_class >= 70 & my_class <= 85)
## [1] 0.525
# How many students in the last 5 years had a grade of 80 or 90?
# %in% tests set membership elementwise (and never returns NA)
sum(last_5 %in% c(80, 90))
## [1] 44
# Define n_smart
n_smart <- sum(my_class >= 80)
# Code the if-else construct
if (n_smart > 50) {
print("smart class")
} else {
print("rather average")
}
## [1] "smart class"
# Define prop_less
prop_less <- mean(my_class < me)
# Code the control construct: branches are tested top to bottom,
# so the first TRUE condition wins
if (prop_less > 0.9) {
print("you're among the best 10 percent")
} else if (prop_less > 0.8) {
print("you're among the best 20 percent")
} else {
print("need more analysis")
}
## [1] "you're among the best 20 percent"
# Embedded control structure: fix the error
if (mean(my_class) < 75) {
if (mean(my_class) > me) {
print("average year, but still smarter than me")
} else {
print("average year, but I'm not that bad")
}
} else {
if (mean(my_class) > me) {
print("smart year, even smarter than me")
} else {
print("smart year, but I am smarter")
}
}
## [1] "smart year, but I am smarter"
# Create top_grades
top_grades <- my_class[my_class >= 85]
# Create worst_grades
worst_grades <- my_class[my_class < 65]
# Write conditional statement
if (length(top_grades) > length(worst_grades)) { print("top grades prevail") }
## [1] "top grades prevail"
Hadley and Charlotte Wickham led a course on writing functions in R. Broadly, the course includes advice on when/how to use functions, as well as specific advice about commands available through library(purrr).
Key pieces of advice include:
John Chambers gave a few useful slogans about functions:
Each function has three components:
Only the LAST evaluated expression is returned. The use of return() is recommended only for early returns in a special case (for example, exiting the function as soon as an edge case has been handled).
Further, functions can be written anonymously on the command line, such as (function (x) {x + 1}) (1:5). A function should only depend on arguments passed to it, not variables from a parent environment. Every time the function is called, it receives a clean working environment. Once it finishes, its variables are no longer available unless they were returned (either by default as the last operation, or by way of return()):
# Components of a function
# args() is a convenience view; formals(), body(), and environment()
# return the three actual components of a function object
args(rnorm)
## function (n, mean = 0, sd = 1)
## NULL
formals(rnorm)
## $n
##
##
## $mean
## [1] 0
##
## $sd
## [1] 1
body(rnorm)
## .Call(C_rnorm, n, mean, sd)
environment(rnorm)
## <environment: namespace:stats>
# What is passed back
# Demonstrates early return() versus the implicit return of the
# last evaluated expression.
funDummy <- function(x) {
  if (x > 2) {
    # Ordinary path: the last evaluated expression is what gets returned
    return(ceiling(x))
  }
  # Special case: report the problem, then hand back the floor value 3
  print("That is too small")
  3
}
funDummy(1)
## [1] "That is too small"
## [1] 3
funDummy(5)
## [1] 5
# Anonymous functions
# An unnamed function can be defined and invoked in a single expression
(function (x) {x + 1}) (1:5)
## [1] 2 3 4 5 6
The course includes some insightful discussion of vectors. As it happens, lists and data frames are just special collections of vectors in R. Each column of a data frame is a vector, while each element of a list is either 1) an embedded data frame (which is eventually a vector by way of columns), 2) an embedded list (which is eventually a vector by way of recursion), or 3) an actual vector.
The atomic vectors are of types logical, integer, character, and double; complex and raw are rarer types that are also available. Lists are just recursive vectors, which is to say that lists can contain other lists and can be heterogeneous. To explore vectors, you have:
Note that NULL is the absence of a vector and has length 0. NA is the absence of an element in the vector and has length 1. All math operations with NA return NA; for example NA == NA will return NA.
There are some good tips on extracting elements from a list:
# Data types
data(mtcars)
# str() gives a compact overview of structure and column types
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
typeof(mtcars) # n.b. that this is technically a "list"
## [1] "list"
# length() of a data frame is its number of columns
length(mtcars)
## [1] 11
# NULL and NA
length(NULL)
## [1] 0
typeof(NULL)
## [1] "NULL"
length(NA)
## [1] 1
typeof(NA)
## [1] "logical"
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
NA == NA
## [1] NA
is.null(NULL)
## [1] TRUE
is.null(NA)
## [1] FALSE
is.na(NULL)
## Warning in is.na(NULL): is.na() applied to non-(list or vector) of type
## 'NULL'
## logical(0)
is.na(NA)
## [1] TRUE
# Extraction
mtcars[["mpg"]][1:5]
## [1] 21.0 21.0 22.8 21.4 18.7
mtcars[[2]][1:5]
## [1] 6 6 4 6 8
mtcars$hp[1:5]
## [1] 110 110 93 110 175
# Relevant lengths
seq_along(mtcars)
## [1] 1 2 3 4 5 6 7 8 9 10 11
x <- data.frame()
seq_along(x)
## integer(0)
length(seq_along(x))
## [1] 0
# Print the storage type (typeof) of every element of x, in order.
# For a data frame this reports each column's underlying type.
foo <- function(x) {
  for (column in x) {
    print(typeof(column))
  }
}
foo(mtcars)
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
## [1] "double"
foo(x) # Note that this does nothing!
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
foo(airquality)
## [1] "integer"
## [1] "integer"
## [1] "double"
## [1] "integer"
## [1] "integer"
## [1] "integer"
# Range command
mpgRange <- range(mtcars$mpg)
mpgRange
## [1] 10.4 33.9
mpgScale <- (mtcars$mpg - mpgRange[1]) / (mpgRange[2] - mpgRange[1])
summary(mpgScale)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2138 0.3745 0.4124 0.5277 1.0000
The typical arguments in a function use a consistent, simple naming convention:
Data arguments should come before detail arguments, and detail arguments should be given reasonable default values. See for example rnorm(n, mean=0, sd=1). The number requested (n) must be specified, but defaults are available for the details (mean and standard deviation).
Functions can be passed as arguments to other functions, which is at the core of functional programming. For example:
# Apply a function to data -- functions are first-class objects in R.
#
# Args:
#   x:   the data to operate on.
#   fun: the function to apply; may be given either as a function object
#        (e.g. mean) or, more generally, as the name of one (e.g. "mean").
#
# Returns: the value of fun(x).
do_math <- function(x, fun) {
  fun <- match.fun(fun)  # accept a function or its name; fail fast with a clear error otherwise
  fun(x)
}
do_math(1:10, fun=mean)
## [1] 5.5
do_math(1:10, fun=sd)
## [1] 3.02765
The library(purrr) takes advantage of this, and in a type-consistent manner. There are functions for:
The general arguments are .x (a list or an atomic vector) and .f which can be either a function, an anonymous function (formula with ~), or an extractor .x[[.f]]. For example:
library(purrr)
##
## Attaching package: 'purrr'
## The following objects are masked from 'package:dplyr':
##
## contains, order_by
library(RColorBrewer) # Need to have in non-cached chunk for later
data(mtcars)
# Create output as a list
map(.x=mtcars, .f=sum)
## $mpg
## [1] 642.9
##
## $cyl
## [1] 198
##
## $disp
## [1] 7383.1
##
## $hp
## [1] 4694
##
## $drat
## [1] 115.09
##
## $wt
## [1] 102.952
##
## $qsec
## [1] 571.16
##
## $vs
## [1] 14
##
## $am
## [1] 13
##
## $gear
## [1] 118
##
## $carb
## [1] 90
# Create same output as a double
map_dbl(.x=mtcars, .f=sum)
## mpg cyl disp hp drat wt qsec vs
## 642.900 198.000 7383.100 4694.000 115.090 102.952 571.160 14.000
## am gear carb
## 13.000 118.000 90.000
# Create same output as integer
# map_int(.x=mtcars, .f=sum) . . . this would bomb since it is not actually an integer
map_int(.x=mtcars, .f=function(x) { as.integer(round(sum(x), 0)) } )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Same thing but using an anonymous function with ~ and .
map_int(.x=mtcars, .f = ~ as.integer(round(sum(.), 0)) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 643 198 7383 4694 115 103 571 14 13 118 90
# Create a boolean vector
map_lgl(.x=mtcars, .f = ~ ifelse(sum(.) > 200, TRUE, FALSE) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# Create a character vector
map_chr(.x=mtcars, .f = ~ ifelse(sum(.) > 200, "Large", "Not So Large") )
## mpg cyl disp hp drat
## "Large" "Not So Large" "Large" "Large" "Not So Large"
## wt qsec vs am gear
## "Not So Large" "Large" "Not So Large" "Not So Large" "Not So Large"
## carb
## "Not So Large"
# Use the extractor [pulls the first row]
map_dbl(.x=mtcars, .f=1)
## mpg cyl disp hp drat wt qsec vs am gear
## 21.00 6.00 160.00 110.00 3.90 2.62 16.46 0.00 1.00 4.00
## carb
## 4.00
# Example from help file using chaining
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Using sapply
sapply(split(mtcars, mtcars$cyl), FUN=function(.x) { summary(lm(mpg ~ wt, data=.x))$r.squared } )
## 4 6 8
## 0.5086326 0.4645102 0.4229655
# Use the extractor from a list
cylSplit <- split(mtcars, mtcars$cyl)
map(cylSplit, "mpg")
## $`4`
## [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
##
## $`6`
## [1] 21.0 21.0 21.4 18.1 19.2 17.8 19.7
##
## $`8`
## [1] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 13.3 19.2 15.8 15.0
map(cylSplit, "cyl")
## $`4`
## [1] 4 4 4 4 4 4 4 4 4 4 4
##
## $`6`
## [1] 6 6 6 6 6 6 6
##
## $`8`
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8
The purrr library has several additional interesting functions:
Some example code includes:
library(purrr) # Called again for clarity; all these key functions belong to purrr
# safely(.f, otherwise = NULL, quiet = TRUE)
safe_log10 <- safely(log10)
map(list(0, 1, 10, "a"), .f=safe_log10)
## [[1]]
## [[1]]$result
## [1] -Inf
##
## [[1]]$error
## NULL
##
##
## [[2]]
## [[2]]$result
## [1] 0
##
## [[2]]$error
## NULL
##
##
## [[3]]
## [[3]]$result
## [1] 1
##
## [[3]]$error
## NULL
##
##
## [[4]]
## [[4]]$result
## NULL
##
## [[4]]$error
## <simpleError in .f(...): non-numeric argument to mathematical function>
# possibly(.f, otherwise, quiet = TRUE)
poss_log10 <- possibly(log10, otherwise=NaN)
map_dbl(list(0, 1, 10, "a"), .f=poss_log10)
## [1] -Inf 0 1 NaN
# transpose() - note that this can become masked by data.table::transpose() so be careful
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))
## $result
## $result[[1]]
## [1] -Inf
##
## $result[[2]]
## [1] 0
##
## $result[[3]]
## [1] 1
##
## $result[[4]]
## NULL
##
##
## $error
## $error[[1]]
## NULL
##
## $error[[2]]
## NULL
##
## $error[[3]]
## NULL
##
## $error[[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result
## [[1]]
## [1] -Inf
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 1
##
## [[4]]
## NULL
unlist(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$result)
## [1] -Inf 0 1
purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## <simpleError in .f(...): non-numeric argument to mathematical function>
map_lgl(purrr::transpose(map(list(0, 1, 10, "a"), .f=safe_log10))$error, is.null)
## [1] TRUE TRUE TRUE FALSE
# map2(.x, .y, .f)
map2(list(5, 10, 20), list(1, 2, 3), .f=rnorm) # rnorm(5, 1), rnorm(10, 2), and rnorm(20, 3)
## [[1]]
## [1] 0.41176421 2.00652288 0.06152025 0.46963873 1.15436157
##
## [[2]]
## [1] 0.006821057 2.902712636 1.436150816 1.377836302 2.625075832
## [6] 0.680797806 0.313499192 0.718062969 2.820989906 3.134207742
##
## [[3]]
## [1] 3.3716474 2.9393673 1.8648940 3.2343343 2.1849894 2.0697179 1.0872014
## [8] 3.4970403 3.5769694 3.0999340 1.2033341 0.9839011 2.9820314 1.7116383
## [15] 0.8779558 1.6990118 2.5914013 2.3587803 3.7460957 1.2980312
# pmap(.l, .f)
pmap(list(n=list(5, 10, 20), mean=list(1, 5, 10), sd=list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 1.0151570 1.1573287 1.0628581 0.8805484 0.9418430
##
## [[2]]
## [1] 5.032920 4.689799 5.423525 5.265610 4.727383 5.252325 5.166292
## [8] 4.861745 5.135408 4.106679
##
## [[3]]
## [1] 9.854138 10.090939 10.045554 9.970755 10.092487 9.769531 10.140064
## [8] 9.834716 10.196817 10.047367 10.054093 10.006439 10.142002 10.092259
## [15] 10.222459 10.082440 10.067818 9.993884 10.078380 9.936942
# invoke_map(.f, .x, ...)
invoke_map(list(rnorm, runif, rexp), n=5)
## [[1]]
## [1] -0.96707137 0.08207476 1.39498168 0.60287972 -0.15130461
##
## [[2]]
## [1] 0.01087442 0.02980483 0.81443586 0.88438198 0.67976034
##
## [[3]]
## [1] 0.2646751 1.3233260 1.1079261 1.3504952 0.6795524
# walk() is for the side effects of a function
x <- list(1, "\n\ta\n", 3)
x %>% walk(cat)
## 1
## a
## 3
# Chaining is available by way of the %>% operator
pretty_titles <- c("N(0, 1)", "Uniform(0, 1)", "Exponential (rate=1)")
set.seed(1607120947)
x <- invoke_map(list(rnorm, runif, rexp), n=5000)
# Apply summary() to each element of x, returning a list (purrr::map).
foo <- function(x) map(x, .f = summary)
par(mfrow=c(1, 3))
pwalk(list(x=x, main=pretty_titles), .f=hist, xlab="", col="light blue") %>% map(.f=foo)
## $x
## $x[[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.711000 -0.637800 -0.000217 0.006543 0.671800 3.633000
##
## $x[[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001241 0.2518000 0.5012000 0.5028000 0.7566000 0.9999000
##
## $x[[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00001 0.29140 0.68340 0.98260 1.37900 8.46300
##
##
## $main
## $main[[1]]
## Length Class Mode
## 1 character character
##
## $main[[2]]
## Length Class Mode
## 1 character character
##
## $main[[3]]
## Length Class Mode
## 1 character character
par(mfrow=c(1, 1))
There are two potentially desirable behaviors with functions:
As a best practice, R functions that will be used for programming (as opposed to interactive command line work) should be written in a robust manner. Three standard problems should be avoided/mitigated:
There are several methods available for throwing errors within an R function:
One example that commonly creates surprises is the [,] operator for extraction. Adding [ , , drop=FALSE] ensures that you will still have what you passed (e.g., a matrix or data frame) rather than conversion of a chunk of data to a vector.
Another common source of error is sapply() which will return a vector when it can and a list otherwise. The map() and map_typ() functions in purrr are designed to be type-stable; if the output is not as expected, they will error out.
Non-standard evaluations take advantage of the existence of something else (e.g., a variable in the parent environment that has not been passed). This can cause confusion and improper results.
Pure functions have the key properties that 1) their output depends only on their inputs, and 2) they do not impact the outside world other than by way of their return value. Specifically, the function should not depend on how the user has configured their global options as shown in options(), nor should it modify those options() settings upon return of control to the parent environment.
A few examples are shown below:
# Throwing errors to stop a function (cannot actually run these!)
# stopifnot(FALSE)
# if (FALSE) { stop("Error: ", call.=FALSE) }
# if (FALSE) { stop("Error: This condition needed to be set as TRUE", call.=FALSE) }
# Behavior of [,] and [,,drop=FALSE]
mtxTest <- matrix(data=1:9, nrow=3, byrow=TRUE)
class(mtxTest)
## [1] "matrix"
mtxTest[1, ]
## [1] 1 2 3
class(mtxTest[1, ])
## [1] "integer"
mtxTest[1, , drop=FALSE]
## [,1] [,2] [,3]
## [1,] 1 2 3
class(mtxTest[1, , drop=FALSE])
## [1] "matrix"
# Behavior of sapply() - may not get what you are expecting
# Square each element of x (vectorized over the whole input).
foo <- function(x) x^2
sapply(1:5, FUN=foo)
## [1] 1 4 9 16 25
class(sapply(1:5, FUN=foo))
## [1] "numeric"
sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [1] 1.00 2.25 4.00 6.25 9.00 16.00 25.00
class(sapply(c(1, list(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "numeric"
sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2.25 4.00 6.25
##
## [[3]]
## [1] 9
##
## [[4]]
## [1] 16
##
## [[5]]
## [1] 25
class(sapply(list(1, c(1.5, 2, 2.5), 3, 4, 5), FUN=foo))
## [1] "list"
This was a very enjoyable and instructive course.
Chapter 1 - Introduction to Object Oriented Programming (OOP)
Typical R usage involves a functional programming style - data to function to new data to new function to newer values and etc. Object Oriented Programming (OOP) instead involves thinking about the data structures (objects), their functionalities, and the like:
There are nine different options for OOP in R:
How does R distinguish types of variables?
Assigning Classes and Implicit Classes:
Example code includes:
# Create these variables
a_numeric_vector <- rlnorm(50)
a_factor <- factor(
sample(c(LETTERS[1:5], NA), 50, replace = TRUE)
)
a_data_frame <- data.frame(
n = a_numeric_vector,
f = a_factor
)
a_linear_model <- lm(dist ~ speed, cars)
# Call summary() on the numeric vector
summary(a_numeric_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08694 0.58120 1.06400 1.63500 1.48800 7.43600
# Do the same for the other three objects
summary(a_factor)
## A B C D E NA's
## 5 9 8 11 11 6
summary(a_data_frame)
## n f
## Min. :0.08694 A : 5
## 1st Qu.:0.58121 B : 9
## Median :1.06361 C : 8
## Mean :1.63546 D :11
## 3rd Qu.:1.48764 E :11
## Max. :7.43560 NA's: 6
summary(a_linear_model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# Summarize the four ways R describes a variable: its S3 class, its
# internal storage type, its mode, and its storage.mode.
# Returns a named character vector.
type_info <- function(x) {
  c(class = class(x), typeof = typeof(x),
    mode = mode(x), storage.mode = storage.mode(x))
}
# Create list of example variables
some_vars <- list(
an_integer_vector = rpois(24, lambda = 5),
a_numeric_vector = rbeta(24, shape1 = 1, shape2 = 1),
an_integer_array = array(rbinom(24, size = 8, prob = 0.5), dim = c(2, 3, 4)),
a_numeric_array = array(rweibull(24, shape = 1, scale = 1), dim = c(2, 3, 4)),
a_data_frame = data.frame(int = rgeom(24, prob = 0.5), num = runif(24)),
a_factor = factor(month.abb),
a_formula = y ~ x,
a_closure_function = mean,
a_builtin_function = length,
a_special_function = `if`
)
# Loop over some_vars calling type_info() on each element to explore them
lapply(some_vars, FUN=type_info)
## $an_integer_vector
## class typeof mode storage.mode
## "integer" "integer" "numeric" "integer"
##
## $a_numeric_vector
## class typeof mode storage.mode
## "numeric" "double" "numeric" "double"
##
## $an_integer_array
## class typeof mode storage.mode
## "array" "integer" "numeric" "integer"
##
## $a_numeric_array
## class typeof mode storage.mode
## "array" "double" "numeric" "double"
##
## $a_data_frame
## class typeof mode storage.mode
## "data.frame" "list" "list" "list"
##
## $a_factor
## class typeof mode storage.mode
## "factor" "integer" "numeric" "integer"
##
## $a_formula
## class typeof mode storage.mode
## "formula" "language" "call" "language"
##
## $a_closure_function
## class typeof mode storage.mode
## "function" "closure" "function" "function"
##
## $a_builtin_function
## class typeof mode storage.mode
## "function" "builtin" "function" "function"
##
## $a_special_function
## class typeof mode storage.mode
## "function" "special" "function" "function"
whiteChess <- list(king="g1", queen="h4", bishops=c("c2", "g5"), knights=character(0), rooks=c("f1", "f6"), pawns=c("a2", "b2", "d4", "e3", "g2", "h2"))
blackChess <- list(king="g8", queen="d7", bishops=c("b7", "e7"), knights=character(0), rooks=c("a6", "f8"), pawns=c("a5", "c3", "c4", "d5", "f7", "g6"))
chess <- list(white=whiteChess, black=blackChess)
# Explore the structure of chess
str(chess)
## List of 2
## $ white:List of 6
## ..$ king : chr "g1"
## ..$ queen : chr "h4"
## ..$ bishops: chr [1:2] "c2" "g5"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "f1" "f6"
## ..$ pawns : chr [1:6] "a2" "b2" "d4" "e3" ...
## $ black:List of 6
## ..$ king : chr "g8"
## ..$ queen : chr "d7"
## ..$ bishops: chr [1:2] "b7" "e7"
## ..$ knights: chr(0)
## ..$ rooks : chr [1:2] "a6" "f8"
## ..$ pawns : chr [1:6] "a5" "c3" "c4" "d5" ...
# Override the class of chess
class(chess) <- "chess_game"
# Is chess still a list?
is.list(chess)
## [1] TRUE
# How many pieces are left on the board?
length(unlist(chess))
## [1] 24
type_info(chess) # note that typeof(), mode(), and storage.mode() all remained as list
## class typeof mode storage.mode
## "chess_game" "list" "list" "list"
Chapter 2 - Using S3
Function overloading is the property of a function of input-dependent behavior:
Methodical Thinking - determining which methods are available for an S3 generic:
S3 and Primitive Functions:
Too Much Class:
Example code includes:
# Create get_n_elements
# S3 generic: UseMethod() dispatches to get_n_elements.<class>(x, ...),
# falling back to get_n_elements.default() when no class-specific method exists.
get_n_elements <- function(x, ...) {
UseMethod("get_n_elements")
}
# View get_n_elements
get_n_elements
## function(x, ...) {
## UseMethod("get_n_elements")
## }
# Create a data.frame method for get_n_elements
# data.frame method: total cell count, i.e. rows times columns.
get_n_elements.data.frame <- function(x, ...) {
  dims <- dim(x)
  dims[[1L]] * dims[[2L]]
}
# Call the method on the sleep dataset
n_elements_sleep <- get_n_elements(sleep)
# View the result
n_elements_sleep
## [1] 60
# View pre-defined objects
# ls.str() ## Do not run, this can be a cluster with many variables loaded . . .
# Create a default method for get_n_elements
# Fallback method: flatten x to an atomic vector and count its elements.
get_n_elements.default <- function(x, ...) {
  flattened <- unlist(x)
  length(flattened)
}
# Call the method on the ability.cov dataset
n_elements_ability.cov <- get_n_elements(ability.cov)
# Find methods for print
methods("print")
## [1] print.acf*
## [2] print.AES*
## [3] print.all_vars*
## [4] print.anova*
## [5] print.any_vars*
## [6] print.aov*
## [7] print.aovlist*
## [8] print.ar*
## [9] print.Arima*
## [10] print.arima0*
## [11] print.AsIs
## [12] print.aspell*
## [13] print.aspell_inspect_context*
## [14] print.bibentry*
## [15] print.Bibtex*
## [16] print.BoolResult*
## [17] print.browseVignettes*
## [18] print.by
## [19] print.bytes*
## [20] print.changedFiles*
## [21] print.check_code_usage_in_package*
## [22] print.check_compiled_code*
## [23] print.check_demo_index*
## [24] print.check_depdef*
## [25] print.check_details*
## [26] print.check_doi_db*
## [27] print.check_dotInternal*
## [28] print.check_make_vars*
## [29] print.check_nonAPI_calls*
## [30] print.check_package_code_assign_to_globalenv*
## [31] print.check_package_code_attach*
## [32] print.check_package_code_data_into_globalenv*
## [33] print.check_package_code_startup_functions*
## [34] print.check_package_code_syntax*
## [35] print.check_package_code_unload_functions*
## [36] print.check_package_compact_datasets*
## [37] print.check_package_CRAN_incoming*
## [38] print.check_package_datasets*
## [39] print.check_package_depends*
## [40] print.check_package_description*
## [41] print.check_package_description_encoding*
## [42] print.check_package_license*
## [43] print.check_packages_in_dir*
## [44] print.check_packages_in_dir_changes*
## [45] print.check_packages_used*
## [46] print.check_po_files*
## [47] print.check_Rd_contents*
## [48] print.check_Rd_line_widths*
## [49] print.check_Rd_metadata*
## [50] print.check_Rd_xrefs*
## [51] print.check_so_symbols*
## [52] print.check_T_and_F*
## [53] print.check_url_db*
## [54] print.check_vignette_index*
## [55] print.checkDocFiles*
## [56] print.checkDocStyle*
## [57] print.checkFF*
## [58] print.checkRd*
## [59] print.checkReplaceFuns*
## [60] print.checkS3methods*
## [61] print.checkTnF*
## [62] print.checkVignettes*
## [63] print.citation*
## [64] print.codoc*
## [65] print.codocClasses*
## [66] print.codocData*
## [67] print.colorConverter*
## [68] print.compactPDF*
## [69] print.condition
## [70] print.connection
## [71] print.CRAN_package_reverse_dependencies_and_views*
## [72] print.data.frame
## [73] print.Date
## [74] print.default
## [75] print.dendrogram*
## [76] print.density*
## [77] print.dictionary*
## [78] print.difftime
## [79] print.dist*
## [80] print.Dlist
## [81] print.DLLInfo
## [82] print.DLLInfoList
## [83] print.DLLRegisteredRoutines
## [84] print.dummy_coef*
## [85] print.dummy_coef_list*
## [86] print.ecdf*
## [87] print.element*
## [88] print.factanal*
## [89] print.factor
## [90] print.family*
## [91] print.fileSnapshot*
## [92] print.findLineNumResult*
## [93] print.flatGridListing*
## [94] print.formula*
## [95] print.frame*
## [96] print.fseq*
## [97] print.ftable*
## [98] print.fun_list*
## [99] print.function
## [100] print.getAnywhere*
## [101] print.ggplot*
## [102] print.ggplot2_bins*
## [103] print.ggproto*
## [104] print.ggproto_method*
## [105] print.gList*
## [106] print.glm*
## [107] print.glue*
## [108] print.gpar*
## [109] print.grob*
## [110] print.gtable*
## [111] print.hclust*
## [112] print.help_files_with_topic*
## [113] print.hexmode
## [114] print.HoltWinters*
## [115] print.hsearch*
## [116] print.hsearch_db*
## [117] print.htest*
## [118] print.html*
## [119] print.html_dependency*
## [120] print.indexed*
## [121] print.infl*
## [122] print.integrate*
## [123] print.isoreg*
## [124] print.kmeans*
## [125] print.knitr_kable*
## [126] print.Latex*
## [127] print.LaTeX*
## [128] print.lazy*
## [129] print.libraryIQR
## [130] print.listof
## [131] print.lm*
## [132] print.loadings*
## [133] print.location*
## [134] print.loess*
## [135] print.logLik*
## [136] print.ls_str*
## [137] print.medpolish*
## [138] print.MethodsFunction*
## [139] print.mtable*
## [140] print.NativeRoutineList
## [141] print.news_db*
## [142] print.nls*
## [143] print.noquote
## [144] print.numeric_version
## [145] print.object_size*
## [146] print.octmode
## [147] print.packageDescription*
## [148] print.packageInfo
## [149] print.packageIQR*
## [150] print.packageStatus*
## [151] print.pairwise.htest*
## [152] print.path*
## [153] print.PDF_Array*
## [154] print.PDF_Dictionary*
## [155] print.pdf_doc*
## [156] print.pdf_fonts*
## [157] print.PDF_Indirect_Reference*
## [158] print.pdf_info*
## [159] print.PDF_Keyword*
## [160] print.PDF_Name*
## [161] print.PDF_Stream*
## [162] print.PDF_String*
## [163] print.person*
## [164] print.POSIXct
## [165] print.POSIXlt
## [166] print.power.htest*
## [167] print.ppr*
## [168] print.prcomp*
## [169] print.princomp*
## [170] print.proc_time
## [171] print.quosure*
## [172] print.quoted*
## [173] print.R6*
## [174] print.R6ClassGenerator*
## [175] print.raster*
## [176] print.Rcpp_stack_trace*
## [177] print.Rd*
## [178] print.recordedplot*
## [179] print.rel*
## [180] print.restart
## [181] print.RGBcolorConverter*
## [182] print.rle
## [183] print.roman*
## [184] print.root_criterion*
## [185] print.rowwise_df*
## [186] print.SavedPlots*
## [187] print.sessionInfo*
## [188] print.shiny.tag*
## [189] print.shiny.tag.list*
## [190] print.simple.list
## [191] print.smooth.spline*
## [192] print.socket*
## [193] print.split*
## [194] print.src*
## [195] print.srcfile
## [196] print.srcref
## [197] print.stepfun*
## [198] print.stl*
## [199] print.StructTS*
## [200] print.subdir_tests*
## [201] print.summarize_CRAN_check_status*
## [202] print.summary.aov*
## [203] print.summary.aovlist*
## [204] print.summary.ecdf*
## [205] print.summary.glm*
## [206] print.summary.lm*
## [207] print.summary.loess*
## [208] print.summary.manova*
## [209] print.summary.nls*
## [210] print.summary.packageStatus*
## [211] print.summary.ppr*
## [212] print.summary.prcomp*
## [213] print.summary.princomp*
## [214] print.summary.table
## [215] print.summaryDefault
## [216] print.table
## [217] print.tables_aov*
## [218] print.tbl*
## [219] print.tbl_cube*
## [220] print.tbl_df*
## [221] print.terms*
## [222] print.theme*
## [223] print.trans*
## [224] print.trunc_mat*
## [225] print.ts*
## [226] print.tskernel*
## [227] print.TukeyHSD*
## [228] print.tukeyline*
## [229] print.tukeysmooth*
## [230] print.undoc*
## [231] print.uneval*
## [232] print.unit*
## [233] print.viewport*
## [234] print.vignette*
## [235] print.warnings
## [236] print.xgettext*
## [237] print.xngettext*
## [238] print.xtabs*
## see '?methods' for accessing help and source code
# Commented due to no dataset "hair" on my machine
# View the structure of hair
# str(hair)
# What primitive generics are available?
.S3PrimitiveGenerics
## [1] "anyNA" "as.character" "as.complex" "as.double"
## [5] "as.environment" "as.integer" "as.logical" "as.numeric"
## [9] "as.raw" "c" "dim" "dim<-"
## [13] "dimnames" "dimnames<-" "is.array" "is.finite"
## [17] "is.infinite" "is.matrix" "is.na" "is.nan"
## [21] "is.numeric" "length" "length<-" "levels<-"
## [25] "names" "names<-" "rep" "seq.int"
## [29] "xtfrm"
# Does length.hairstylist exist?
# exists("length.hairstylist")
# What is the length of hair?
# length(hair)
kitty <- "Miaow!"
# Assign classes
class(kitty) <- c("cat", "mammal", "character")
# Does kitty inherit from cat/mammal/character vector?
inherits(kitty, "cat")
## [1] TRUE
inherits(kitty, "mammal")
## [1] TRUE
inherits(kitty, "character")
## [1] TRUE
# Is kitty a character vector?
is.character(kitty)
## [1] TRUE
# Does kitty inherit from dog?
inherits(kitty, "dog")
## [1] FALSE
# S3 generic used to demonstrate chaining methods with NextMethod().
what_am_i <- function(x, ...) UseMethod("what_am_i")
# cat method
# cat method: announce, then hand off to the next method in the
# object's class vector via NextMethod().
what_am_i.cat <- function(x, ...) {
  print("I'm a cat")
  NextMethod("what_am_i")
}
# mammal method
# mammal method: announce, then continue the dispatch chain.
what_am_i.mammal <- function(x, ...) {
  print("I'm a mammal")
  NextMethod("what_am_i")
}
# character method
# character method: the terminal link of the chain (no NextMethod call).
what_am_i.character <- function(x, ...) {
  print("I'm a character vector")
}
# Call what_am_i()
what_am_i(kitty)
## [1] "I'm a cat"
## [1] "I'm a mammal"
## [1] "I'm a character vector"
Chapter 3 - Using R6
Object factory - R6 provides a means of storing data and objects within the same variable:
Hiding Complexity with Encapsulation - should be able to use something even if the internal (hidden) functionality is very complicated:
Generally, data available in the “private” area of a class is not available to users:
Example code includes:
# Define microwave_oven_factory
# Minimal R6 generator: one private field, no public members yet.
microwave_oven_factory <- R6::R6Class(
  classname = "MicrowaveOven",
  private = list(power_rating_watts = 800)
)
# View the microwave_oven_factory
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## clone: function (deep = FALSE)
## Private:
## power_rating_watts: 800
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Make a new microwave oven
microwave_oven <- microwave_oven_factory$new()
# Add a cook method to the factory definition
# Add a public cook() method: block for the cook time, then notify.
microwave_oven_factory <- R6::R6Class(
  classname = "MicrowaveOven",
  private = list(power_rating_watts = 800),
  public = list(
    # Simulate cooking by sleeping for `time_seconds`
    cook = function(time_seconds) {
      Sys.sleep(time_seconds)
      print("Your food is cooked!")
    }
  )
)
)
# Create microwave oven object
a_microwave_oven <- microwave_oven_factory$new()
# Call cook method for 1 second
a_microwave_oven$cook(time_seconds=1)
## [1] "Your food is cooked!"
# Add a close_door() method
# Track door state privately; expose open/close mutator methods.
microwave_oven_factory <- R6::R6Class(
  classname = "MicrowaveOven",
  private = list(
    power_rating_watts = 800,
    door_is_open = FALSE
  ),
  public = list(
    # Simulate cooking by sleeping for `time_seconds`
    cook = function(time_seconds) {
      Sys.sleep(time_seconds)
      print("Your food is cooked!")
    },
    open_door = function() {
      private$door_is_open <- TRUE
    },
    close_door = function() {
      private$door_is_open <- FALSE
    }
  )
)
# Add an initialize method
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
power_rating_watts = 800,
door_is_open = FALSE
),
public = list(
# Simulate cooking by blocking for `time_seconds`, then announce completion
cook = function(time_seconds) {
Sys.sleep(time_seconds)
print("Your food is cooked!")
},
open_door = function() {
private$door_is_open = TRUE
},
close_door = function() {
private$door_is_open = FALSE
},
# initialize() runs automatically on $new(); both arguments are optional --
# missing() keeps the class defaults (800 watts, door closed)
initialize = function(power_rating_watts, door_is_open) {
if (!missing(power_rating_watts)) {
private$power_rating_watts <- power_rating_watts
}
if (!missing(door_is_open)) {
private$door_is_open <- door_is_open
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new(power_rating_watts=650, door_is_open=TRUE)
# Add a binding for power rating
# Read-only access to a private field via an active binding: the binding
# function takes no `value` argument, so assignment to it errors.
microwave_oven_factory <- R6::R6Class(
  classname = "MicrowaveOven",
  private = list(..power_rating_watts = 800),
  active = list(
    power_rating_watts = function() private$..power_rating_watts
  )
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power rating
a_microwave_oven$power_rating_watts
## [1] 800
# Add a binding for power rating
microwave_oven_factory <- R6::R6Class(
"MicrowaveOven",
private = list(
..power_rating_watts = 800,
..power_level_watts = 800
),
# Read/write active binding: reading returns the private field; writing
# validates the new value before storing it
active=list(
power_level_watts = function(value) {
if (missing(value)) {
private$..power_level_watts
} else {
# Warn (not error) if the value is non-numeric or outside [0, rating]
assertive.types::assert_is_a_number(value, severity="warning")
assertive.numbers::assert_all_are_in_closed_range(value,
lower=0,
upper=private$..power_rating_watts,
severity="warning"
)
private$..power_level_watts <- value
}
}
)
)
# Make a microwave
a_microwave_oven <- microwave_oven_factory$new()
# Get the power level
a_microwave_oven$power_level_watts
## [1] 800
# Try to set the power level to "400"
a_microwave_oven$power_level_watts <- "400"
## Warning in (function (value) : is_a_number : value is not of class
## 'numeric'; it has class 'character'.
## Warning: Coercing value to class 'numeric'.
# Try to set the power level to 1600 watts
a_microwave_oven$power_level_watts <- 1600
## Warning in (function (value) : is_in_closed_range : value are not all in the range [0,800].
## There was 1 failure:
## Position Value Cause
## 1 1 1600 too high
# Set the power level to 400 watts
a_microwave_oven$power_level_watts <- 400
Chapter 4 - R6 Inheritance
Inheritance is an attempt to avoid “copy and paste” from one class to another (dependent, fancier, or the like) class:
Extend or Override to create additional functionality:
Multiple Levels of Inheritance - a can inherit from b that inherited from c and the like:
Example code includes:
# Consolidated MicrowaveOven class: private state, public methods, and
# active bindings (one read-only, one read/write with clamping).
microwave_oven_factory <-
R6::R6Class("MicrowaveOven",
private=list(..power_rating_watts=800,
..power_level_watts=800,
..door_is_open=FALSE
),
public=list(cook=function(time) Sys.sleep(time),
open_door=function() private$..door_is_open <- TRUE,
close_door = function() private$..door_is_open <- FALSE
),
# power_rating_watts is read-only; power_level_watts clamps assigned
# values into [0, ..power_rating_watts]
active=list(power_rating_watts=function() private$..power_rating_watts,
power_level_watts = function(value) {
if (missing(value)) {
private$..power_level_watts
} else {
private$..power_level_watts <-
max(0,
min(private$..power_rating_watts,
as.numeric(value)
)
)
}
}
)
)
# Explore the microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Define a fancy microwave class inheriting from microwave oven
# Subclass via `inherit`; no new members yet, so behavior matches the parent.
fancy_microwave_oven_factory <- R6::R6Class(
  classname = "FancyMicrowaveOven",
  inherit = microwave_oven_factory
)
# Explore microwave oven classes
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Inherits from: <microwave_oven_factory>
## Public:
## clone: function (deep = FALSE)
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Instantiate both types of microwave
a_microwave_oven <- microwave_oven_factory$new()
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Get power rating for each microwave
microwave_power_rating <- a_microwave_oven$power_level_watts
fancy_microwave_power_rating <- a_fancy_microwave$power_level_watts
# Verify that these are the same
identical(microwave_power_rating, fancy_microwave_power_rating)
## [1] TRUE
# Cook with each microwave
a_microwave_oven$cook(1)
a_fancy_microwave$cook(1)
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Extend the class definition
# Extend the parent class with an additional method; inherited members remain.
fancy_microwave_oven_factory <- R6::R6Class(
  classname = "FancyMicrowaveOven",
  inherit = microwave_oven_factory,
  public = list(
    # self$cook is inherited from MicrowaveOven
    cook_baked_potato = function() self$cook(3)
  )
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook_baked_potato() method
a_fancy_microwave$cook_baked_potato()
# Explore microwave oven class
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# Subclass of MicrowaveOven that overrides cook(): it delegates the
# actual cooking to the parent implementation, then greets the user.
fancy_microwave_oven_factory <- R6::R6Class(
  classname = "FancyMicrowaveOven",
  inherit = microwave_oven_factory,
  public = list(
    # Cook via the parent class, then print a friendly message.
    cook = function(time_seconds) {
      super$cook(time_seconds)
      message("Enjoy your dinner!")
    }
  )
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the cook() method
a_fancy_microwave$cook(1)
## Enjoy your dinner!
# Fancy microwave: baked-potato preset, friendly cook() override, and a
# `super_` active binding. R6's `super` only reaches one level up, so
# exposing it through a binding lets grandchild classes reach this
# class's parent (see the high-end microwave below).
fancy_microwave_oven_factory <- R6::R6Class(
  classname = "FancyMicrowaveOven",
  inherit = microwave_oven_factory,
  public = list(
    # Preset: cook for 3 time units through the overridden cook().
    cook_baked_potato = function() {
      self$cook(3)
    },
    # Delegate to the parent's cook(), then greet the user.
    cook = function(time_seconds) {
      super$cook(time_seconds)
      message("Enjoy your dinner!")
    }
  ),
  active = list(
    # Read-only handle on this class's super object.
    super_ = function() super
  )
)
# Instantiate a fancy microwave
a_fancy_microwave <- fancy_microwave_oven_factory$new()
# Call the super_ binding
a_fancy_microwave$super_
## <environment: 0x000000000c28e4d0>
ascii_pizza_slice <- " __\n // \"\"--.._\n|| (_) _ \"-._\n|| _ (_) '-.\n|| (_) __..-'\n \\\\__..--\"\""
# Explore other microwaves
microwave_oven_factory
## <MicrowaveOven> object generator
## Public:
## cook: function (time)
## open_door: function ()
## close_door: function ()
## clone: function (deep = FALSE)
## Active bindings:
## power_rating_watts: function ()
## power_level_watts: function (value)
## Private:
## ..power_rating_watts: 800
## ..power_level_watts: 800
## ..door_is_open: FALSE
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
fancy_microwave_oven_factory
## <FancyMicrowaveOven> object generator
## Inherits from: <microwave_oven_factory>
## Public:
## cook_baked_potato: function ()
## cook: function (time_seconds)
## clone: function (deep = FALSE)
## Active bindings:
## super_: function ()
## Parent env: <environment: R_GlobalEnv>
## Locked objects: TRUE
## Locked class: FALSE
## Portable: TRUE
# High-end microwave: overrides cook() to bypass FancyMicrowaveOven's
# "Enjoy your dinner!" message and instead show ASCII pizza art.
high_end_microwave_oven_factory <- R6::R6Class(
  classname = "HighEndMicrowaveOven",
  inherit = fancy_microwave_oven_factory,
  public = list(
    cook = function(time_seconds) {
      # super$super_ is the parent's active binding, which reaches the
      # grandparent (MicrowaveOven) cook() implementation directly.
      super$super_$cook(time_seconds)
      message(ascii_pizza_slice)
    }
  )
)
# Instantiate a high-end microwave oven
a_high_end_microwave <- high_end_microwave_oven_factory$new()
# Use it to cook for one second
a_high_end_microwave$cook(1)
## __
## // ""--.._
## || (_) _ "-._
## || _ (_) '-.
## || (_) __..-'
## \\__..--""
Chapter 5 - Advanced R6 Usage
Environments, Reference Behavior, and Static Fields:
Cloning Objects - R6 is built using environments, so the “copy by reference” is part and parcel of R6:
Shut it Down - if the R6 object is linked to any databases or has any side effects, it can be a good idea to shut it down:
Example code includes:
# Environments are R's reference-semantics container; lists are copied
# on modification. The two identical() checks below demonstrate the
# difference.
# Define a new environment
env <- new.env()
# Add an element named perfect (via $)
env$perfect <- c(6, 28, 496)
# Add an element named bases (via [[ ]] — equivalent for environments)
env[["bases"]] <- c("A", "C", "G", "T")
# Assign lst and env
lst <- list(
perfect = c(6, 28, 496),
bases = c("A", "C", "G", "T")
)
env <- list2env(lst)
# Copy lst (value semantics: lst2 becomes an independent copy)
lst2 <- lst
# Change lst's bases element
lst$bases <- c("A", "C", "G", "U")
# Test lst and lst2 identical: FALSE — modifying lst did not touch lst2
identical(lst$bases, lst2$bases)
## [1] FALSE
# Copy env (reference semantics: env2 points at the SAME environment)
env2 <- env
# Change env's bases element
env$bases <- c("A", "C", "G", "U")
# Test env and env2 identical: TRUE — both names see the same update
identical(env$bases, env2$bases)
## [1] TRUE
# MicrowaveOven variant demonstrating a "static" (class-wide) field:
# the private `shared` environment is created once, when the class is
# defined, so every instance holds a reference to the same environment.
env_microwave_oven_factory <- R6::R6Class(
  "MicrowaveOven",
  private = list(
    # local() evaluates the initializer in a throwaway environment.
    # The original bare `{ e <- new.env(); ... }` block ran in the
    # calling (global) environment and leaked the helper variable `e`
    # there as a side effect; local() avoids that leak while still
    # creating the shared environment exactly once at definition time.
    shared = local({
      e <- new.env()
      e$safety_warning <- "Warning. Do not try to cook metal objects."
      e
    })
  ),
  active = list(
    # Read/write binding for the class-wide safety warning: reading
    # (no `value`) returns it; assigning updates it for ALL instances,
    # because they all share the same private environment.
    safety_warning = function(value) {
      if (missing(value)) {
        private$shared$safety_warning
      } else {
        private$shared$safety_warning <- value
      }
    }
  )
)
# Create two microwave ovens
a_microwave_oven <- env_microwave_oven_factory$new()
another_microwave_oven <- env_microwave_oven_factory$new()
# Change the safety warning for a_microwave_oven
a_microwave_oven$safety_warning <- "Warning. If the food is too hot you may scald yourself."
# Verify that the warning has changed for another_microwave_oven too:
# both instances reference the same private environment, so the update
# is visible class-wide (static-field behavior).
another_microwave_oven$safety_warning
## [1] "Warning. If the food is too hot you may scald yourself."
# Still uses microwave_oven_factory as defined in Chapter 4
# Create a microwave oven
a_microwave_oven <- microwave_oven_factory$new()
# Copy a_microwave_oven using <- : R6 objects have reference semantics,
# so this copies only the reference — both names point at one object
assigned_microwave_oven <- a_microwave_oven
# Copy a_microwave_oven using clone(): this creates an independent object
cloned_microwave_oven <- a_microwave_oven$clone()
# Change a_microwave_oven's power level
a_microwave_oven$power_level_watts <- 400
# Check a_microwave_oven & assigned_microwave_oven same (shared object)
identical(a_microwave_oven$power_level_watts, assigned_microwave_oven$power_level_watts)
## [1] TRUE
# Check a_microwave_oven & cloned_microwave_oven different (true copy)
!identical(a_microwave_oven$power_level_watts, cloned_microwave_oven$power_level_watts)
## [1] TRUE
# Commented out, because power_plug was never defined on microwave_oven_factory
# Create a microwave oven
# a_microwave_oven <- microwave_oven_factory$new()
# Look at its power plug
# a_microwave_oven$power_plug
# Copy a_microwave_oven using clone(), no args
# cloned_microwave_oven <- a_microwave_oven$clone()
# Copy a_microwave_oven using clone(), deep = TRUE
# deep_cloned_microwave_oven <- a_microwave_oven$clone(deep=TRUE)
# Change a_microwave_oven's power plug type
# a_microwave_oven$power_plug$type <- "British"
# Check a_microwave_oven & cloned_microwave_oven same
# identical(a_microwave_oven$power_plug$type, cloned_microwave_oven$power_plug$type)
# Check a_microwave_oven & deep_cloned_microwave_oven different
# !identical(a_microwave_oven$power_plug$type, deep_cloned_microwave_oven$power_plug$type)
# Commented due to not having this SQL database
# Microwave_factory is pre-defined
# microwave_oven_factory
# Complete the class definition
# smart_microwave_oven_factory <- R6::R6Class(
# "SmartMicrowaveOven",
# inherit = microwave_oven_factory, # Specify inheritance
# private = list(
# conn = NULL
# ),
# public = list(
# initialize = function() {
# # Connect to the database
# private$conn = dbConnect(SQLite(), "cooking-times.sqlite")
# },
# get_cooking_time = function(food) {
# dbGetQuery(
# private$conn,
# sprintf("SELECT time_seconds FROM cooking_times WHERE food = '%s'", food)
# )
# },
# finalize = function() {
# message("Disconnecting from the cooking times database.")
# dbDisconnect(private$conn)
# }
# )
# )
# Create a smart microwave object
# a_smart_microwave <- smart_microwave_oven_factory$new()
# Call the get_cooking_time() method
# a_smart_microwave$get_cooking_time("soup")
# Remove the smart microwave
# rm(a_smart_microwave)
# Force garbage collection
# gc()
A nice introduction to S3 and R6.
Chapter 1 - What is Machine Learning?
Machine learning is the process of constructing and using algorithms that learn from data:
Classification, Regression, Clustering are three common forms of machine learning problems:
Supervised vs Unsupervised Learning:
Example code includes:
data(iris, package="datasets")
# Reveal number of observations and variables in two different ways
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
dim(iris)
## [1] 150 5
# Show first and last observations in the iris data set
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
tail(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# Summarize the iris data set
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
data(Wage, package="ISLR")
# Build Linear Model: lm_wage (coded already)
lm_wage <- lm(wage ~ age, data = Wage)
# Define data.frame: unseen (coded already)
unseen <- data.frame(age = 60)
# Predict the wage for a 60-year old worker
predict(lm_wage, unseen)
## 1
## 124.1413
emails <- data.frame(
avg_capital_seq=c( 1, 2.11, 4.12, 1.86, 2.97, 1.69, 5.891, 3.17, 1.23, 2.44, 3.56, 3.25, 1.33 ),
spam=as.integer(c( 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1 ))
)
str(emails)
## 'data.frame': 13 obs. of 2 variables:
## $ avg_capital_seq: num 1 2.11 4.12 1.86 2.97 ...
## $ spam : int 0 0 1 0 1 0 1 0 0 1 ...
# Show the dimensions of emails
dim(emails)
## [1] 13 2
# Hand-tuned rule set: values of avg_capital_seq fall into alternating
# spam (1) / not-spam (0) bands. The two masks below together cover the
# whole real line, so every non-NA input gets exactly one label.
spam_classifier <- function(x) {
  prediction <- rep(NA, length(x))  # NA where x is NA
  # Bands flagged as spam (1)
  spam_band <- (x > 4) | (x >= 2.2 & x < 3) | (x > 1.25 & x < 1.4)
  # Complementary not-spam bands (0)
  ham_band <- (x >= 3 & x <= 4) | (x >= 1.4 & x < 2.2) | (x <= 1.25)
  prediction[spam_band] <- 1
  prediction[ham_band] <- 0
  prediction
}
# Apply the classifier to the avg_capital_seq column: spam_pred
spam_pred <- spam_classifier(emails$avg_capital_seq)
# Compare spam_pred to emails$spam. Use ==
spam_pred == emails$spam
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
linkedin <- c( 5, 7, 4, 9, 11, 10, 14, 17, 13, 11, 18, 17, 21, 21, 24, 23, 28, 35, 21, 27, 23 )
# Create the days vector
days <- 1:length(linkedin)
# Fit a linear model called on the linkedin views per day: linkedin_lm
linkedin_lm <- lm(linkedin ~ days)
# Predict the number of views for the next three days: linkedin_pred
future_days <- data.frame(days = 22:24)
linkedin_pred <- predict(linkedin_lm, future_days)
# Plot historical data and predictions
plot(linkedin ~ days, xlim = c(1, 24))
points(22:24, linkedin_pred, col = "green")
# Chop up iris in my_iris (features only) and species (true labels)
my_iris <- iris[-5]
species <- iris$Species
# Perform k-means clustering on my_iris: kmeans_iris
# NOTE(review): kmeans() starts from random centroids; without set.seed()
# the cluster numbering (and hence this table) can differ between runs.
kmeans_iris <- kmeans(my_iris, 3)
# Compare the actual Species to the clustering using table():
# rows are cluster ids, columns are true species
table(kmeans_iris$cluster, species)
## species
## setosa versicolor virginica
## 1 50 0 0
## 2 0 2 36
## 3 0 48 14
# Plot Petal.Width against Petal.Length, coloring by cluster
plot(Petal.Length ~ Petal.Width, data = my_iris, col = kmeans_iris$cluster)
# Take a look at the iris dataset
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# A decision tree model has been built for you
tree <- rpart::rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
data = iris, method = "class")
# A dataframe containing unseen observations
unseen <- data.frame(Sepal.Length = c(5.3, 7.2),
Sepal.Width = c(2.9, 3.9),
Petal.Length = c(1.7, 5.4),
Petal.Width = c(0.8, 2.3)
)
# Predict the label of the unseen observations. Print out the result.
predict(tree, unseen, type="class")
## 1 2
## setosa virginica
## Levels: setosa versicolor virginica
data(mtcars, package="datasets")
cars <- mtcars[,c("wt", "hp")]
str(cars)
## 'data.frame': 32 obs. of 2 variables:
## $ wt: num 2.62 2.88 2.32 3.21 3.44 ...
## $ hp: num 110 110 93 110 175 105 245 62 95 123 ...
# Explore the cars dataset
str(cars)
## 'data.frame': 32 obs. of 2 variables:
## $ wt: num 2.62 2.88 2.32 3.21 3.44 ...
## $ hp: num 110 110 93 110 175 105 245 62 95 123 ...
summary(cars)
## wt hp
## Min. :1.513 Min. : 52.0
## 1st Qu.:2.581 1st Qu.: 96.5
## Median :3.325 Median :123.0
## Mean :3.217 Mean :146.7
## 3rd Qu.:3.610 3rd Qu.:180.0
## Max. :5.424 Max. :335.0
# Group the dataset into two clusters: km_cars
km_cars <- kmeans(cars, 2)
# Print out the contents of each cluster
km_cars$cluster
## Mazda RX4 Mazda RX4 Wag Datsun 710
## 2 2 2
## Hornet 4 Drive Hornet Sportabout Valiant
## 2 1 2
## Duster 360 Merc 240D Merc 230
## 1 2 2
## Merc 280 Merc 280C Merc 450SE
## 2 2 1
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## 1 1 1
## Lincoln Continental Chrysler Imperial Fiat 128
## 1 1 2
## Honda Civic Toyota Corolla Toyota Corona
## 2 2 2
## Dodge Challenger AMC Javelin Camaro Z28
## 2 2 1
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## 1 2 2
## Lotus Europa Ford Pantera L Ferrari Dino
## 2 1 1
## Maserati Bora Volvo 142E
## 1 2
# Group the dataset into two clusters: km_cars
km_cars <- kmeans(cars, 2)
# Add code: color the points in the plot based on the clusters
plot(cars, col=km_cars$cluster)
# Print out the cluster centroids
km_cars$centers
## wt hp
## 1 2.692000 99.47368
## 2 3.984923 215.69231
# Replace the ___ part: add the centroids to the plot
points(km_cars$centers, pch = 22, bg = c(1, 2), cex = 2)
Chapter 2 - Performance Measures
Measuring model performance or error - is the model good?
Training set and test set - predictive power is about the ability to make predictions about unseen data:
Bias and variance are the main error sources for a predictive model:
Example code includes:
library(dplyr)
data(titanic_train, package="titanic")
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=factor(Sex)) %>%
na.omit()
# Have a look at the structure of titanic
str(titanic)
## 'data.frame': 714 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 1 1 1 2 2 2 1 1 1 ...
## $ Pclass : int 3 1 3 1 3 1 3 3 2 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# A decision tree classification model is built on the data
tree <- rpart::rpart(Survived ~ ., data = titanic, method = "class")
# Use the predict() method to make predictions, assign to pred
pred <- predict(tree, titanic, type="class")
# Use the table() method to make the confusion matrix.
# Rows are the truth (titanic$Survived), columns are predictions; the
# factor level order c(1, 0) puts the positive class (survived) first.
(conf <- table(titanic$Survived, pred))
## pred
## 1 0
## 1 212 78
## 0 53 371
# Assign TP, FN, FP and TN using conf
TP <- conf[1, 1] # true positives: survived, predicted survived (212)
FN <- conf[1, 2] # false negatives: survived, predicted died (78)
FP <- conf[2, 1] # false positives: died, predicted survived
TN <- conf[2, 2] # true negatives: died, predicted died
# Calculate and print the accuracy: acc = (TP + TN) / all cases
(acc <- sum(TP, TN) / sum(conf))
## [1] 0.8165266
# Calculate and print out the precision: prec = TP / predicted positives
(prec <- TP / (TP + FP))
## [1] 0.8
# Calculate and print out the recall: rec = TP / actual positives
(rec <- TP / (TP + FN))
## [1] 0.7310345
# DO NOT HAVE THIS DATASET
# Take a look at the structure of air
# str(air)
# Inspect your colleague's code to build the model
# fit <- lm(dec ~ freq + angle + ch_length, data = air)
# Use the model to predict for all values: pred
# pred <- predict(fit, air)
# Use air$dec and pred to calculate the RMSE
# rmse <- sqrt( mean((air$dec-pred)^2) )
# Print out rmse
# rmse
# Previous model
# fit <- lm(dec ~ freq + angle + ch_length, data = air)
# pred <- predict(fit)
# rmse <- sqrt(sum( (air$dec - pred) ^ 2) / nrow(air))
# rmse
# Your colleague's more complex model
# fit2 <- lm(dec ~ freq + angle + ch_length + velocity + thickness, data = air)
# Use the model to predict for all values: pred2
# pred2 <- predict(fit2)
# Calculate rmse2
# rmse2 <- sqrt(sum( (air$dec - pred2) ^ 2) / nrow(air))
# Print out rmse2
# rmse2
# ALSO DO NOT HAVE THIS DATASET, THOUGH IT IS AVAILABLE ON UCI
# Explore the structure of the dataset
seeds <- read.delim("seeds.txt", header=FALSE,
col.names=c("area", "perimeter", "compactness", "length",
"width", "asymmetry", "groove", "type"
)
)
str(seeds)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness: num 0.871 0.881 0.905 0.895 0.903 ...
## $ length : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry : num 2.22 1.02 2.7 2.26 1.35 ...
## $ groove : num 5.22 4.96 4.83 4.8 5.17 ...
## $ type : int 1 1 1 1 1 1 1 1 1 1 ...
# Group the seeds in three clusters
km_seeds <- kmeans(seeds[,-8], 3)
# Color the points in the plot based on the clusters
plot(length ~ compactness, data = seeds, col=km_seeds$cluster)
# Print out the ratio of the WSS to the BSS
with(km_seeds, tot.withinss / betweenss)
## [1] 0.2762846
# Shuffle the dataset, call the result shuffled
n <- nrow(titanic)
shuffled <- titanic[sample(n),]
# Split the data in train and test
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices, ]
test <- shuffled[-train_indices, ]
# Print the structure of train and test
str(train)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 2 2 2 2 2 1 1 1 1 ...
## $ Pclass : int 3 2 3 3 3 3 1 3 1 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 1 2 2 1 2 2 1 ...
## $ Age : num 17 25 25 2 2 70.5 33 29 36 5 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
str(test)
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 1 2 2 2 2 2 2 1 1 ...
## $ Pclass : int 1 3 3 3 2 3 1 2 3 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 2 1 2 1 2 2 2 2 ...
## $ Age : num 2 29 32 14 31 18 37 36.5 21 32 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Fill in the model that has been learned.
tree <- rpart::rpart(Survived ~ ., data=train, method = "class")
# Predict the outcome on the test set with tree: pred
pred <- predict(tree, newdata=test, type="class")
# Calculate the confusion matrix: conf
(conf <- table(test$Survived, pred))
## pred
## 1 0
## 1 62 32
## 0 21 99
# 6-fold cross-validation on the pre-shuffled titanic data: each pass
# holds out one fold as the test set, trains a tree on the remaining
# rows, and records that fold's accuracy.
# Initialize the accs vector
accs <- rep(0, 6)
# Assign every row of shuffled to exactly one of 6 folds. Unlike the
# original round((1/6) * nrow(shuffled)) index arithmetic — which drops
# (or double-counts) trailing rows whenever nrow is not a multiple of 6 —
# cut() partitions ALL row positions into 6 contiguous, near-equal folds.
fold <- cut(seq_len(nrow(shuffled)), breaks = 6, labels = FALSE)
for (i in 1:6) {
  # These indices indicate the interval of the test set
  indices <- which(fold == i)
  # Exclude them from the train set
  train <- shuffled[-indices, ]
  # Include them in the test set
  test <- shuffled[indices, ]
  # A model is learned using each training set
  tree <- rpart::rpart(Survived ~ ., train, method = "class")
  # Make a prediction on the test set using tree
  pred <- predict(tree, newdata = test, type = "class")
  # Assign the confusion matrix to conf
  conf <- table(test$Survived, pred)
  # Assign the accuracy of this model to the ith index in accs
  accs[i] <- sum(diag(conf)) / sum(conf)
}
# Print out the mean of accs (the cross-validated accuracy estimate)
mean(accs)
## [1] 0.7955182
data(spam, package="kernlab")
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ type : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
emails_full <- spam %>%
select(capitalAve, type) %>%
mutate(avg_capital_seq=capitalAve, spam=factor(as.integer(type)-1, levels=c(1, 0))) %>%
select(avg_capital_seq, spam)
str(emails_full)
## 'data.frame': 4601 obs. of 2 variables:
## $ avg_capital_seq: num 3.76 5.11 9.82 3.54 3.54 ...
## $ spam : Factor w/ 2 levels "1","0": 1 1 1 1 1 1 1 1 1 1 ...
# The spam filter that has been 'learned' for you: buckets
# avg_capital_seq into alternating spam/ham bands and returns a factor
# with levels "1" (spam) first and "0" (ham) second.
spam_classifier <- function(x) {
  prediction <- rep(NA, length(x))
  # Bands classified as spam (1) and as ham (0); together they cover
  # the whole real line, so every non-NA input gets a label.
  prediction[(x > 4) | (x >= 2.2 & x < 3) | (x > 1.25 & x < 1.4)] <- 1
  prediction[(x >= 3 & x <= 4) | (x >= 1.4 & x < 2.2) | (x <= 1.25)] <- 0
  factor(prediction, levels = c("1", "0"))
}
# Apply spam_classifier to emails_full: pred_full
pred_full <- spam_classifier(emails_full$avg_capital_seq)
# Build confusion matrix for emails_full: conf_full
conf_full <- table(emails_full$spam, pred_full)
# Calculate the accuracy with conf_full: acc_full
(acc_full <- sum(diag(conf_full)) / sum(conf_full))
## [1] 0.6561617
emails_small <- data.frame(avg_capital_seq=c( 1, 2.112, 4.123, 1.863, 2.973, 1.687, 5.891,
3.167, 1.23, 2.441, 3.555, 3.25, 1.333
),
spam=factor(c(0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1), levels=c(1, 0))
)
str(emails_small)
## 'data.frame': 13 obs. of 2 variables:
## $ avg_capital_seq: num 1 2.11 4.12 1.86 2.97 ...
## $ spam : Factor w/ 2 levels "1","0": 2 2 1 2 1 2 1 2 2 1 ...
# Simplified spam filter: a single threshold at 4 on avg_capital_seq.
# Returns a factor with levels "1" (spam) first, "0" (ham) second.
spam_classifier <- function(x) {
  labels <- ifelse(x > 4, 1, 0)  # NA stays NA
  factor(labels, levels = c("1", "0"))
}
# conf_small and acc_small have been calculated for you
conf_small <- table(emails_small$spam, spam_classifier(emails_small$avg_capital_seq))
acc_small <- sum(diag(conf_small)) / sum(conf_small)
acc_small
## [1] 0.7692308
# Apply spam_classifier to emails_full and calculate the confusion matrix: conf_full
conf_full <- table(emails_full$spam, spam_classifier(emails_full$avg_capital_seq))
# Calculate acc_full
(acc_full <- sum(diag(conf_full)) / sum(conf_full))
## [1] 0.7259291
Chapter 3 - Classification
Decision trees - assign class to an unseen observation (each observation consists of a vector of features, and a classification):
K-nearest-neighbors (knn) - an example of “instance based learning”:
ROC curve - Receiver Operating Characteristic curve - is a powerful performance measure for binary classification:
Example code includes:
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=factor(Sex), Pclass=factor(Pclass)) %>%
na.omit()
trIdx <- sample(x=1:nrow(titanic), size=round(.7*nrow(titanic)), replace=FALSE)
train <- titanic[trIdx, ]
test <- titanic[-trIdx, ]
str(train); str(test)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 2 1 1 1 2 2 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 2 3 2 1 1 2 3 3 3 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 1 1 2 2 1 2 2 ...
## $ Age : num 36 25 62 15 21 57 19 35 31 38 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 2 1 1 1 2 1 1 1 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 1 1 3 3 1 3 2 2 1 1 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 2 1 1 1 1 1 2 2 2 ...
## $ Age : num 35 54 27 4 58 14 55 34 28 40 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# Fill in the ___, build a tree model: tree
tree <- rpart::rpart(Survived ~ ., data=train, method="class")
# Draw the decision tree
rattle::fancyRpartPlot(tree)
# Predict the values of the test set: pred
pred <- predict(tree, newdata=test, type="class")
# Construct the confusion matrix: conf
(conf <- table(test$Survived, pred))
## pred
## 1 0
## 1 71 24
## 0 22 97
# Print out the accuracy
sum(diag(conf)) / sum(conf)
## [1] 0.7850467
# Calculation of a complex tree
tree <- rpart::rpart(Survived ~ ., train, method = "class", control = rpart::rpart.control(cp=0.00001))
# Draw the complex tree
rattle::fancyRpartPlot(tree)
# Prune the tree: pruned
pruned <- rpart::prune(tree, cp=0.01)
# Draw pruned
rattle::fancyRpartPlot(pruned)
data(spam, package="kernlab")
spam <- spam %>%
mutate(spam=as.integer(type)-1L) %>%
select(-type)
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ spam : int 1 1 1 1 1 1 1 1 1 1 ...
idxTrain <- sample(x=1:nrow(spam), size=round(.7*nrow(spam)), replace=FALSE)
train <- spam[idxTrain, ]
test <- spam[-idxTrain, ]
dim(train); dim(test)
## [1] 3221 58
## [1] 1380 58
# Train and test tree with gini criterion
tree_g <- rpart::rpart(spam ~ ., train, method = "class")
pred_g <- predict(tree_g, test, type = "class")
conf_g <- table(test$spam, pred_g)
acc_g <- sum(diag(conf_g)) / sum(conf_g)
# Change the first line of code to use information gain as splitting criterion
tree_i <- rpart::rpart(spam ~ ., train, method = "class", parms = list(split = "information"))
pred_i <- predict(tree_i, test, type = "class")
conf_i <- table(test$spam, pred_i)
acc_i <- sum(diag(conf_i)) / sum(conf_i)
# Draw a fancy plot of both tree_g and tree_i
rattle::fancyRpartPlot(tree_g)
rattle::fancyRpartPlot(tree_i)
# Print out acc_g and acc_i
acc_g
## [1] 0.8869565
acc_i
## [1] 0.8971014
# Shuffle the dataset, call the result shuffled
# Build the modelling frame first: keep four columns; make Survived a factor
# with level order c(1, 0) so "1" (survived) is the first/positive class in
# later confusion matrices; recode Sex to an integer -- factor() sorts levels
# alphabetically ("female" < "male"), so Sex becomes 0 = female, 1 = male.
# na.omit() drops the rows with missing Age.
titanic <- titanic_train %>%
select(Survived, Pclass, Sex, Age) %>%
mutate(Survived=factor(Survived, levels=c(1, 0)), Sex=as.integer(factor(Sex))-1L) %>%
na.omit()
n <- nrow(titanic)
# sample(n) is a random permutation of 1:n, i.e. a row shuffle.
shuffled <- titanic[sample(n),]
# Split the data in train and test
# First 70% of the shuffled rows -> train, remainder -> test.
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices, ]
test <- shuffled[-train_indices, ]
# Print the structure of train and test
# (the "na.action" attribute below is the bookkeeping left behind by na.omit())
str(train)
## 'data.frame': 500 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 2 1 1 1 2 2 1 1 1 2 ...
## $ Pclass : int 3 1 1 1 3 3 2 2 1 3 ...
## $ Sex : int 0 1 0 1 1 1 0 1 1 1 ...
## $ Age : num 2 26 43 27 19 36 41 19 25 28 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
str(test)
## 'data.frame': 214 obs. of 4 variables:
## $ Survived: Factor w/ 2 levels "1","0": 1 2 2 2 2 1 1 2 2 2 ...
## $ Pclass : int 2 3 2 3 2 2 1 3 3 2 ...
## $ Sex : int 0 1 1 1 1 0 0 1 1 1 ...
## $ Age : num 28 17 34 28 29 36 19 22 25 30 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## .. ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
# k-NN on the titanic split: separate the labels, drop them from the feature
# frames, and min-max scale the numeric features before computing distances.
# Store the Survived column of train and test in train_labels and test_labels
train_labels <- train$Survived
test_labels <- test$Survived
# Copy train and test to knn_train and knn_test
knn_train <- train
knn_test <- test
# Drop Survived column for knn_train and knn_test
knn_train$Survived <- NULL
knn_test$Survived <- NULL
# Normalize Pclass
# Min-max scaling uses the TRAINING min/max for both sets, so the test data
# is transformed with statistics learned from training data only (no leakage).
min_class <- min(knn_train$Pclass)
max_class <- max(knn_train$Pclass)
knn_train$Pclass <- (knn_train$Pclass - min_class) / (max_class - min_class)
knn_test$Pclass <- (knn_test$Pclass - min_class) / (max_class - min_class)
# Normalize Age
min_age <- min(knn_train$Age)
max_age <- max(knn_train$Age)
knn_train$Age <- (knn_train$Age - min_age) / (max_age - min_age)
knn_test$Age <- (knn_test$Age - min_age) / (max_age - min_age)
# (Sex is already a 0/1 indicator and is left unscaled.)
# Fill in the ___, make predictions using knn: pred
pred <- class::knn(train = knn_train, test = knn_test, cl = train_labels, k = 5)
# Construct the confusion matrix: conf
# Outer parentheses make the assignment also print its value.
(conf <- table(test_labels, pred))
## pred
## test_labels 1 0
## 1 60 28
## 0 12 114
# Sweep k from 1 up to 20% of the training-set size and record the test
# accuracy for each value. NOTE(review): `range` shadows base::range() here;
# harmless in this script, but a name like k_values would be safer.
range <- 1:round(0.2 * nrow(knn_train))
accs <- rep(0, length(range))
for (k in range) {
# Fill in the ___, make predictions using knn: pred
pred <- class::knn(knn_train, knn_test, cl=train_labels, k = k)
# Fill in the ___, construct the confusion matrix: conf
conf <- table(test_labels, pred)
# Fill in the ___, calculate the accuracy and store it in accs[k]
accs[k] <- sum(diag(conf)) / sum(conf)
}
# Plot the accuracies. Title of x-axis is "k".
plot(range, accs, xlab = "k")
# Calculate the best k
# which.max returns the index of the first maximum, i.e. the smallest best k.
which.max(accs)
## [1] 3
# CAUTION - DO NOT HAVE THIS DATA, though UCIMLR (Census + Income) is the SOURCE
# test should be 9215 x 14 while train should be 21503 x 14
# income is the key variable, with 1 meaning > $50,000 while 0 meaning otherwise
# Build a tree on the training set: tree
# tree <- rpart::rpart(income ~ ., train, method = "class")
# Predict probability values using the model: all_probs
# all_probs <- predict(tree, newdata=test, type="prob")
# Print out all_probs
# str(all_probs)
# Select second column of all_probs: probs
# probs <- all_probs[, 2]
# Make a prediction object: pred
# pred <- ROCR::prediction(probs, test$income)
# Make a performance object: perf
# perf <- ROCR::performance(pred, "tpr", "fpr")
# Plot this curve
# plot(perf)
# Make a performance object: perf
# perf <- ROCR::performance(pred, "auc")
# Print out the AUC
# perf@y.values[[1]]
# EVEN MORE DATA THAT I DO NOT HAVE
# Plot the ROC curves of a decision-tree model and a k-NN model on one canvas.
#
# Args:
#   tree: ROCR "performance" object for the decision tree, built with
#         ROCR::performance(pred, "tpr", "fpr").
#   knn:  ROCR "performance" object for the k-NN model, same construction.
#
# Side effect: draws both curves (tree = red, k-NN = green) plus a legend.
# Stops with an informative error when the arguments are not suitable
# performance objects.
draw_roc_lines <- function(tree, knn) {
  # TRUE when obj is an S4 "performance" object from the ROCR package.
  # inherits() replaces the fragile `class(x) == "..."` comparison, and
  # identical() keeps the package check a length-one logical even when the
  # class carries no "package" attribute (the original attr(...) == "ROCR"
  # comparison yielded logical(0) in that case, making if() fail with
  # "missing value where TRUE/FALSE needed" instead of the intended stop()).
  is_rocr_performance <- function(obj) {
    inherits(obj, "performance") &&
      identical(attr(class(obj), "package"), "ROCR")
  }
  if (!is_rocr_performance(tree) || !is_rocr_performance(knn)) {
    stop("This predefined function needs two performance objects as arguments.")
  } else if (length(tree@x.values) == 0 || length(knn@x.values) == 0) {
    # Scalar || instead of the elementwise | used originally.
    stop('This predefined function needs the right kind of performance objects as arguments. Are you sure you are creating both objects with arguments "tpr" and "fpr"?')
  } else {
    # Empty canvas spanning the unit square, then one line per model.
    plot(0, 0,
         type = "n",
         main = "ROC Curves",
         ylab = "True positive rate",
         xlab = "False positive rate",
         ylim = c(0, 1),
         xlim = c(0, 1))
    lines(tree@x.values[[1]], tree@y.values[[1]], type = "l", lwd = 2, col = "red")
    lines(knn@x.values[[1]], knn@y.values[[1]], type = "l", lwd = 2, col = "green")
    legend("bottomright", c("DT", "KNN"), lty = c(1, 1), lwd = c(2.5, 2.5), col = c("red", "green"))
  }
}
# Make the prediction objects for both models: pred_t, pred_k
# pred_t <- ROCR::prediction(probs_t, test$spam)
# pred_k <- ROCR::prediction(probs_k, test$spam)
# Make the performance objects for both models: perf_t, perf_k
# perf_t <- ROCR::performance(pred_t, "tpr", "fpr")
# perf_k <- ROCR::performance(pred_k, "tpr", "fpr")
# Draw the ROC lines using draw_roc_lines()
# draw_roc_lines(perf_t, perf_k)
Chapter 4 - Regression
Simple Linear Regression - estimates an actual value rather than the class of an observation:
Multivariable Linear Regression - combines several predictors in a single model:
k-Nearest-Neighbors and Generalization - a solution to the problem of not knowing which transformations to use:
Example code includes:
# Simple linear regression example: predict kangaroo nose length from nose
# width (45 observations, both in the same raw units as the vectors below).
kang_nose <- data.frame(nose_width=c( 241, 222, 233, 207, 247, 189, 226, 240, 215, 231, 263, 220, 271, 284, 279, 272, 268, 278, 238, 255, 308, 281, 288, 306, 236, 204, 216, 225, 220, 219, 201, 213, 228, 234, 237, 217, 211, 238, 221, 281, 292, 251, 231, 275, 275 ) ,
nose_length=c( 609, 629, 620, 564, 645, 493, 606, 660, 630, 672, 778, 616, 727, 810, 778, 823, 755, 710, 701, 803, 855, 838, 830, 864, 635, 565, 562, 580, 596, 597, 636, 559, 615, 740, 677, 675, 629, 692, 710, 730, 763, 686, 717, 737, 816 )
)
str(kang_nose)
## 'data.frame': 45 obs. of 2 variables:
## $ nose_width : num 241 222 233 207 247 189 226 240 215 231 ...
## $ nose_length: num 609 629 620 564 645 493 606 660 630 672 ...
# New observation to predict for (column name must match the model formula).
nose_width_new <- data.frame(nose_width=250)
# Plot nose length as function of nose width.
plot(kang_nose, xlab = "nose width", ylab = "nose length")
# Fill in the ___, describe the linear relationship between the two variables: lm_kang
lm_kang <- lm(nose_length ~ nose_width, data = kang_nose)
# Print the coefficients of lm_kang
lm_kang$coefficients
## (Intercept) nose_width
## 27.893058 2.701175
# Predict and print the nose length of the escaped kangaroo
predict(lm_kang, newdata=nose_width_new)
## 1
## 703.1869
# Refit the model, overlay the regression line, then compute RMSE and
# R-squared by hand and confirm them against summary().
# Build model and make plot
lm_kang <- lm(nose_length ~ nose_width, data=kang_nose)
plot(kang_nose, xlab = "nose width", ylab = "nose length")
abline(lm_kang$coefficients, col = "red")
# Apply predict() to lm_kang: nose_length_est
# predict() with no newdata returns fitted values for the training data.
nose_length_est <- predict(lm_kang)
# Calculate difference between the predicted and the true values: res
res <- (kang_nose$nose_length - nose_length_est)
# Calculate RMSE, assign it to rmse and print it
# Outer parentheses make the assignment also print its value.
(rmse <- sqrt( mean( res^2 ) ))
## [1] 43.26288
# Calculate the residual sum of squares: ss_res
ss_res <- sum(res^2)
# Determine the total sum of squares: ss_tot
ss_tot <- sum( (kang_nose$nose_length - mean(kang_nose$nose_length))^2 )
# Calculate R-squared and assign it to r_sq. Also print it.
# Matches "Multiple R-squared" in the summary() output below.
(r_sq <- 1 - ss_res / ss_tot)
## [1] 0.7768914
# Apply summary() to lm_kang
summary(lm_kang)
##
## Call:
## lm(formula = nose_length ~ nose_width, data = kang_nose)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.876 -32.912 -4.855 30.227 86.307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.8931 54.2991 0.514 0.61
## nose_width 2.7012 0.2207 12.236 1.34e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.26 on 43 degrees of freedom
## Multiple R-squared: 0.7769, Adjusted R-squared: 0.7717
## F-statistic: 149.7 on 1 and 43 DF, p-value: 1.342e-15
# World Bank training data: GDP per capita (cgdp) and percentage of urban
# population (urb_pop) for 142 countries; fit urb_pop on raw cgdp first.
cgdp <- c( 666.3, 5935.7, 4619.2, 7574.3, 3646.7, 13961.2, 51127.1, 7884.2, 295.1, 47516.5, 825.2, 720, 1096.6, 7712.8, 22245.5, 4796.2, 8040, 11612.5, 15199.3, 40776.3, 7757, 378.6, 7593.9, 1426.4, 7720, 860.8, 3715.3, 10035.4, 27194.4, 47627.4, 7433.9, 60634.4, 6222.5, 6850.3, 39567.9, 590.2, 30262.2, 567.8, 36317.8, 1555, 49541.3, 4543.3, 1461.6, 550, 422.8, 585.6, 21682.6, 8299.1, 3703, 37896.8, 2346.7, 13507.4, 13902.7, 3514.6, 53313.6, 6432.8, 52111, 34960.3, 36194.4, 1269.1, 1084.4, 1604.4, 15209.9, 27970.5, 9127.3, 1707.5, 10139.2, 6575.4, 7437, 10125.6, 944.4, 648.1, 3631, 2032.8, 4301.1, 995.5, 16037.8, 3140, 2233.8, 449.4, 8624.8, 8518.7, 10361.3, 4731.6, 5370.7, 765.7, 1197.5, 4333.3, 7370.9, 4170.2, 1270.2, 10005.6, 253, 54198.7, 440.7, 3184.6, 1913.6, 97363.1, 698.3, 38400.1, 4749, 1333.5, 11770.9, 6594.4, 2843.1, 11879.7, 14422.8, 22080.9, 4479.1, 3575.2, 93397.1, 9996.7, 12735.9, 652.1, 1541.1, 25409, 1904.2, 1070.9, 2021.7, 3950.7, 6152.9, 1781.1, 1113.4, 1692.4, 18416.5, 23962.6, 58887.3, 2682.3, 15359.2, 1053.8, 646.1, 9031.5, 1280.4, 4106.4, 998.1, 677.4, 3082.5, 7986.9, 16810.9, 6477.9, 475.2, 1801.9 )
urb_pop <- c( 26.3, 43.3, 56.4, 57.6, 62.8, 24.2, 65.9, 54.4, 11.8, 97.8, 43.5, 29, 33.5, 73.6, 82.8, 39.6, 76.3, 85.4, 31.6, 76.9, 57.2, 39.8, 54.4, 53.8, 76.2, 28.2, 64.8, 75.9, 67, 75.1, 69.3, 87.5, 51.9, 59.9, 75.7, 22.2, 79.4, 19, 74.6, 40.7, 84.1, 53.4, 53.4, 36.7, 59, 48.5, 77.7, 35.6, 51.1, 80.6, 54.1, 58.7, 70.8, 53, 63, 69.4, 94, 68.8, 93, 35.6, 20.5, 44.2, 32, 82.4, 77.7, 37.6, 87.7, 78.4, 18.5, 79.5, 30.9, 29.6, 18.3, 38.6, 47, 26.8, 67.4, 59.7, 44.9, 34.5, 44.5, 64.1, 79, 49.1, 57, 39.1, 33.6, 60.4, 63.8, 71.2, 59.3, 39.8, 16.1, 81.5, 18.5, 46.9, 58.5, 80.2, 18.2, 80, 48.6, 38.3, 66.3, 78.3, 44.5, 86.5, 60.6, 62.9, 59.4, 37.2, 99.2, 54.4, 73.9, 27.8, 32.6, 82.9, 33.6, 43.4, 21.9, 66.3, 55.5, 37.2, 18.6, 64.5, 53.8, 49.7, 85.7, 21.3, 53.6, 22.3, 39.5, 49.7, 32.1, 23.6, 30.9, 15.8, 69.5, 61.8, 95.2, 64.3, 42, 40.5 )
world_bank_train <- data.frame(urb_pop=urb_pop, cgdp=cgdp)
str(world_bank_train)
## 'data.frame': 142 obs. of 2 variables:
## $ urb_pop: num 26.3 43.3 56.4 57.6 62.8 24.2 65.9 54.4 11.8 97.8 ...
## $ cgdp : num 666 5936 4619 7574 3647 ...
# New observation (Afghanistan's GDP per capita) to predict for.
cgdp_afg <- data.frame(cgdp=413)
# Plot urb_pop as function of cgdp
with(world_bank_train, plot(y=urb_pop, x=cgdp))
# Set up a linear model between the two variables: lm_wb
lm_wb <- lm(urb_pop ~ cgdp, data=world_bank_train)
# Add a red regression line to your scatter plot
abline(lm_wb$coefficients, col="red")
# Summarize lm_wb and select R-squared
summary(lm_wb)$r.squared
## [1] 0.3822347
# Predict the urban population of afghanistan based on cgdp_afg
predict(lm_wb, newdata=cgdp_afg)
## 1
## 45.0156
# Same data, but regress on log(cgdp): the log transform straightens the
# curved GDP/urbanization relationship and raises R-squared from 0.38 to 0.58.
# Plot: change the formula and xlab
plot(urb_pop ~ log(cgdp), data = world_bank_train,
xlab = "log(GDP per Capita)",
ylab = "Percentage of urban population")
# Linear model: change the formula
lm_wb <- lm(urb_pop ~ log(cgdp), data = world_bank_train)
# Add a red regression line to your scatter plot
abline(lm_wb$coefficients, col = "red")
# Summarize lm_wb and select R-squared
summary(lm_wb)$r.squared
## [1] 0.5788284
# Predict the urban population of afghanistan based on cgdp_afg
# predict() applies the log() from the formula to the raw cgdp value itself.
predict(lm_wb, newdata=cgdp_afg)
## 1
## 25.86829
# Multivariable regression example: net sales of 27 shops modelled on shop
# size (sq_ft), inventory (inv), advertising (ads), district size
# (size_dist) and number of competitors (comp).
sales <- c( 231, 156, 10, 519, 437, 487, 299, 195, 20, 68, 570, 428, 464, 15, 65, 98, 398, 161, 397, 497, 528, 99, 0.5, 347, 341, 507, 400 )
sq_ft <- c( 3, 2.2, 0.5, 5.5, 4.4, 4.8, 3.1, 2.5, 1.2, 0.6, 5.4, 4.2, 4.7, 0.6, 1.2, 1.6, 4.3, 2.6, 3.8, 5.3, 5.6, 0.8, 1.1, 3.6, 3.5, 5.1, 8.6 )
inv <- c( 294, 232, 149, 600, 567, 571, 512, 347, 212, 102, 788, 577, 535, 163, 168, 151, 342, 196, 453, 518, 615, 278, 142, 461, 382, 590, 517 )
ads <- c( 8.2, 6.9, 3, 12, 10.6, 11.8, 8.1, 7.7, 3.3, 4.9, 17.4, 10.5, 11.3, 2.5, 4.7, 4.6, 5.5, 7.2, 10.4, 11.5, 12.3, 2.8, 3.1, 9.6, 9.8, 12, 7 )
size_dist <- c( 8.2, 4.1, 4.3, 16.1, 14.1, 12.7, 10.1, 8.4, 2.1, 4.7, 12.3, 14, 15, 2.5, 3.3, 2.7, 16, 6.3, 13.9, 16.3, 16, 6.5, 1.6, 11.3, 11.5, 15.7, 12 )
comp <- c( 11, 12, 15, 1, 5, 4, 10, 12, 15, 8, 1, 7, 3, 14, 11, 10, 4, 13, 7, 1, 0, 14, 12, 6, 5, 0, 8 )
shop_data <- data.frame(sales=sales, sq_ft=sq_ft, inv=inv, ads=ads,
size_dist=size_dist, comp=comp
)
str(shop_data)
## 'data.frame': 27 obs. of 6 variables:
## $ sales : num 231 156 10 519 437 487 299 195 20 68 ...
## $ sq_ft : num 3 2.2 0.5 5.5 4.4 4.8 3.1 2.5 1.2 0.6 ...
## $ inv : num 294 232 149 600 567 571 512 347 212 102 ...
## $ ads : num 8.2 6.9 3 12 10.6 11.8 8.1 7.7 3.3 4.9 ...
## $ size_dist: num 8.2 4.1 4.3 16.1 14.1 12.7 10.1 8.4 2.1 4.7 ...
## $ comp : num 11 12 15 1 5 4 10 12 15 8 ...
# New shop to predict net sales for (all predictors except sales).
shop_new <- data.frame(sq_ft=2.3, inv=420, ads=8.7, size_dist=9.1, comp=10)
# Add a plot: sales as a function of inventory. Is linearity plausible?
plot(sales ~ sq_ft, shop_data)
plot(sales ~ size_dist, shop_data)
plot(sales ~ inv, shop_data)
# Build a linear model for net sales based on all other variables: lm_shop
# The "." in the formula means: every column of shop_data except sales.
lm_shop <- lm(sales ~ ., data=shop_data)
# Summarize lm_shop
summary(lm_shop)
##
## Call:
## lm(formula = sales ~ ., data = shop_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.338 -9.699 -4.496 4.040 41.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.85941 30.15023 -0.626 0.538372
## sq_ft 16.20157 3.54444 4.571 0.000166 ***
## inv 0.17464 0.05761 3.032 0.006347 **
## ads 11.52627 2.53210 4.552 0.000174 ***
## size_dist 13.58031 1.77046 7.671 1.61e-07 ***
## comp -5.31097 1.70543 -3.114 0.005249 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 21 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.9916
## F-statistic: 611.6 on 5 and 21 DF, p-value: < 2.2e-16
# Diagnostics for lm_shop: residuals-vs-fitted (look for structure) and a
# normal Q-Q plot of the residuals, then predict sales for the new shop.
# Plot the residuals in function of your fitted observations
plot(x=lm_shop$fitted.values, y=lm_shop$residuals)
# Make a Q-Q plot of your residual quantiles
qqnorm(lm_shop$residuals, ylab="Residual Quantiles")
# Summarize your model, are there any irrelevant predictors?
# (All five predictors are significant at the 1% level in this output.)
summary(lm_shop)
##
## Call:
## lm(formula = sales ~ ., data = shop_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.338 -9.699 -4.496 4.040 41.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.85941 30.15023 -0.626 0.538372
## sq_ft 16.20157 3.54444 4.571 0.000166 ***
## inv 0.17464 0.05761 3.032 0.006347 **
## ads 11.52627 2.53210 4.552 0.000174 ***
## size_dist 13.58031 1.77046 7.671 1.61e-07 ***
## comp -5.31097 1.70543 -3.114 0.005249 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 21 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.9916
## F-statistic: 611.6 on 5 and 21 DF, p-value: < 2.2e-16
# Predict the net sales based on shop_new.
predict(lm_shop, newdata=shop_new)
## 1
## 262.5006
# Second multivariable example: energy content of 56 chocolate bars modelled
# on protein, fat and bar size; size turns out not to be significant.
choco_data <- data.frame(
energy=c( 1970, 2003, 2057, 1920, 2250, 2186, 1930, 1980, 1890, 2030, 2180, 1623, 1640, 2210, 1980, 1970, 1877.4, 2021.4, 1840.1, 2272.1, 2047.3, 1843, 2075.2, 2119.8, 2090.9, 1934.3, 2257.3, 2057.9, 1878.2, 1595.3, 2188.3, 1980.4, 1985.9, 2156.5, 2134.6, 2094.2, 2151.7, 2127.7, 2001.9, 1635.2, 2098.9, 1978.6, 1961.2, 1727.2, 1903.7, 2062.6, 2230.1, 1970.5, 2057.4, 1979.2, 1744.1, 1914.9, 1918.7, 1978.1, 2184, 2124.4 ),
protein=c( 3.1, 4.6, 9.9, 5.1, 10.2, 7, 3.5, 7.2, 4.7, 5.6, 5.5, 2.2, 3.7, 8.2, 8.5, 5, 6.1, 4.6, 3.4, 10.5, 5.9, 3.2, 5.6, 7.5, 7.3, 5.4, 8.9, 6, 2.8, 3.4, 5.5, 7, 7.7, 8.9, 9.4, 7.5, 10.4, 5.6, 9.1, 2.9, 9.1, 4.7, 2.2, 2.3, 6.3, 6.7, 8.3, 6.3, 5.3, 7.8, 5.8, 7, 4.3, 6.9, 8.9, 5 ),
fat=c( 27.2, 26.5, 23, 18.4, 30.1, 28.4, 24.5, 22.9, 19.5, 20.4, 26.8, 9.2, 12, 29.8, 20.6, 20, 18, 22.3, 20.8, 27.7, 25.7, 18.3, 27.6, 25.8, 26.9, 21.6, 29.4, 27.8, 21.4, 12.9, 32.1, 24.4, 19.6, 26.6, 24.5, 24.6, 27.2, 26.1, 21.8, 12.2, 25, 26.7, 22, 16.5, 21.5, 29.6, 28.1, 20.8, 28.1, 21.2, 15.4, 19.9, 18.9, 21.9, 30.5, 25.1 ),
size=c( 50, 50, 40, 80, 45, 78, 55, 60, 60, 50, 40, 55, 44.5, 75, 60, 42.5, 52.3, 52.3, 63.1, 64.8, 46.9, 45, 60.7, 66.3, 54.7, 66.2, 62.6, 48, 58.8, 37.5, 75.4, 80.8, 50.6, 43.3, 63.9, 54.4, 87.6, 55.9, 64.3, 52.8, 46.7, 57.7, 31.8, 72, 56.6, 83.9, 63.4, 46, 63.7, 43.2, 37.2, 58.5, 49, 55.2, 57.9, 48.8 )
)
str(choco_data)
## 'data.frame': 56 obs. of 4 variables:
## $ energy : num 1970 2003 2057 1920 2250 ...
## $ protein: num 3.1 4.6 9.9 5.1 10.2 7 3.5 7.2 4.7 5.6 ...
## $ fat : num 27.2 26.5 23 18.4 30.1 28.4 24.5 22.9 19.5 20.4 ...
## $ size : num 50 50 40 80 45 78 55 60 60 50 ...
# Add a plot: energy/100g as function of total size. Linearity plausible?
plot(energy ~ protein, choco_data)
plot(energy ~ fat, choco_data)
plot(energy ~ size, choco_data)
# Build a linear model for the energy based on all other variables: lm_choco
lm_choco <- lm(energy ~ ., data=choco_data)
# Plot the residuals in function of your fitted observations
plot(x=lm_choco$fitted.values, y=lm_choco$residuals)
# Make a Q-Q plot of your residual quantiles
qqnorm(lm_choco$residuals)
# Summarize lm_choco
# Note: size has p = 0.178, so it adds little once protein and fat are in.
summary(lm_choco)
##
## Call:
## lm(formula = energy ~ ., data = choco_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.680 -36.071 -9.062 36.079 104.361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1339.2806 40.0195 33.466 < 2e-16 ***
## protein 23.0122 3.6565 6.293 6.6e-08 ***
## fat 24.4416 1.6839 14.515 < 2e-16 ***
## size -0.8224 0.6026 -1.365 0.178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.14 on 52 degrees of freedom
## Multiple R-squared: 0.9021, Adjusted R-squared: 0.8965
## F-statistic: 159.8 on 3 and 52 DF, p-value: < 2.2e-16
# Held-out World Bank test set (61 countries) to compare training vs test
# RMSE of the log-linear model; a ratio near 1 suggests little overfitting.
world_bank_test <- data.frame(
cgdp=c( 18389.4, 1099, 2379.2, 5823.3, 3670, 788.4, 1646.4, 19553.9, 1630.8, 61887, 2965.9, 3436.3, 12276.4, 3150.5, 42736.2, 16529.7, 10067.5, 25592.4, 50271.1, 5422.6, 6290.8, 20832, 10803.5, 935.9, 37031.7, 5292.9, 45603.3, 42522, 56286.8, 14520, 5361.1, 6662.6, 4017, 2037.7, 6075.5, 1784.4, 96443.7, 40169.6, 19719.8, 1796, 619, 10829.9, 16444.8, 14091.4, 54629.5, 5560.7, 43619.1, 19199.3, 832.9, 9463.1, 25198.1, 461, 5719.6, 3100.8, 10542.8, 12922.4, 1337.9, 51590, 914.7, 2052.3, 4173.4 ),
urb_pop=c( 39.8, 26.7, 37.9, 46.2, 53.5, 39.6, 53.5, 73, 32.4, 89.3, 75, 43.1, 53.3, 68.1, 79.3, 88.9, 86.9, 70.7, 81.7, 83.4, 63.5, 77.2, 53.5, 32.5, 92.1, 72.9, 82.3, 85.3, 100, 89.4, 70.1, 50.2, 28.5, 36.3, 78.1, 77.3, 100, 100, 67.6, 37.2, 31.9, 74, 66.5, 62.3, 81.4, 49.2, 80.7, 80.5, 57.4, 55.7, 88.7, 49.3, 45.7, 65, 72.9, 91.6, 25.2, 89.9, 34, 33, 19.3 )
)
str(world_bank_test)
## 'data.frame': 61 obs. of 2 variables:
## $ cgdp : num 18389 1099 2379 5823 3670 ...
## $ urb_pop: num 39.8 26.7 37.9 46.2 53.5 39.6 53.5 73 32.4 89.3 ...
# Build the log-linear model
lm_wb_log <- lm(urb_pop ~ log(cgdp), data = world_bank_train)
# Calculate rmse_train
rmse_train <- sqrt(mean(lm_wb_log$residuals ^ 2))
# The real percentage of urban population in the test set, the ground truth
world_bank_test_truth <- world_bank_test$urb_pop
# The predictions of the percentage of urban population in the test set
world_bank_test_input <- data.frame(cgdp = world_bank_test$cgdp)
world_bank_test_output <- predict(lm_wb_log, world_bank_test_input)
# The residuals: predictions minus ground truth (the sign convention is
# irrelevant for RMSE, which squares the residuals)
res_test <- world_bank_test_output - world_bank_test_truth
# Use res_test to calculate rmse_test
rmse_test <- sqrt(mean(res_test^2))
# Print the ratio of the test RMSE over the training RMSE
rmse_test / rmse_train
## [1] 1.082428
# k-nearest-neighbours regression for a single numeric predictor.
#
# Args:
#   x_pred: numeric vector of predictor values to produce predictions for.
#   x:      numeric vector of training predictor values.
#   y:      numeric vector of training responses (parallel to x).
#   k:      number of nearest neighbours to average over.
#
# Returns: numeric vector of predictions, one per element of x_pred
#          (length zero when x_pred is empty).
my_knn <- function(x_pred, x, y, k){
  m <- length(x_pred)
  predict_knn <- rep(0, m)
  # seq_len() instead of 1:m — with the original 1:m, an empty x_pred made
  # the loop iterate over c(1, 0) instead of not at all.
  for (i in seq_len(m)) {
    # Calculate the absolute distance between x_pred[i] and x
    dist <- abs(x_pred[i] - x)
    # Apply order() to dist, sort_index will contain
    # the indices of elements in the dist vector, in
    # ascending order. This means sort_index[1:k] will
    # return the indices of the k-nearest neighbors.
    sort_index <- order(dist)
    # Apply mean() to the responses of the k-nearest neighbors
    predict_knn[i] <- mean(y[sort_index[1:k]])
  }
  return(predict_knn)
}
# Compare three approaches on the World Bank test set: simple linear model,
# log-linear model and the hand-rolled k-NN (k = 30), by plot and by RMSE.
# Apply your algorithm on the test set: test_output
test_output <- my_knn(x_pred=world_bank_test$cgdp, x=world_bank_train$cgdp,
y=world_bank_train$urb_pop, k=30
)
# Have a look at the plot of the output
# world_bank_train[,2:1] swaps the columns so cgdp ends up on the x-axis.
plot(world_bank_train[,2:1],
xlab = "GDP per Capita",
ylab = "Percentage Urban Population")
points(world_bank_test$cgdp, test_output, col = "green")
# Set up a linear model between the two variables: lm_wb
lm_wb <- lm(urb_pop ~ cgdp, data=world_bank_train)
# Set up a linear model between the two variables: lm_wb
lm_wb_log <- lm(urb_pop ~ log(cgdp), data=world_bank_train)
# Define ranks to order the predictor variables in the test set
# (lines() needs x in ascending order to draw a sensible curve)
ranks <- order(world_bank_test$cgdp)
# Scatter plot of test set
plot(world_bank_test,
xlab = "GDP per Capita", ylab = "Percentage Urban Population")
# Predict with simple linear model and add line
test_output_lm <- predict(lm_wb, data.frame(cgdp = world_bank_test$cgdp))
lines(world_bank_test$cgdp[ranks], test_output_lm[ranks], lwd = 2, col = "blue")
# Predict with log-linear model and add line
test_output_lm_log <- predict(lm_wb_log, data.frame(cgdp = world_bank_test$cgdp))
lines(world_bank_test$cgdp[ranks], test_output_lm_log[ranks], lwd = 2, col = "red")
# Predict with k-NN and add line
test_output_knn <- my_knn(x_pred=world_bank_test$cgdp, x=world_bank_train$cgdp,
y=world_bank_train$urb_pop, k=30
)
lines(world_bank_test$cgdp[ranks], test_output_knn[ranks], lwd = 2, col = "green")
# Calculate RMSE on the test set for simple linear model
sqrt(mean( (test_output_lm - world_bank_test$urb_pop) ^ 2))
## [1] 17.41258
# Calculate RMSE on the test set for log-linear model
sqrt(mean( (test_output_lm_log - world_bank_test$urb_pop) ^ 2))
## [1] 15.01008
# Calculate RMSE on the test set for k-NN technique
# (the log-linear model wins here; k-NN is second)
sqrt(mean( (test_output_knn - world_bank_test$urb_pop) ^ 2))
## [1] 16.0917
Chapter 5 - Clustering
Clustering with k-means (unsupervised learning) - objects that are similar within and dissimilar across:
Performance and scaling issues - since there is no “truth”, the goal is to have compact clusters with low variance within the clusters and high separation between the clusters:
Hierarchical Clustering - addressing questions such as “which objects cluster first” and “which cluster pairs merge, and when”:
Example code includes:
# UCI wheat-seeds data: 210 kernels, 7 geometric measurements per kernel;
# built column by column. seeds_type (below) holds the true varieties
# (1/2/3, 70 each) and is kept OUT of the data frame so clustering is
# unsupervised.
seeds <- data.frame(area=c( 15.26, 14.88, 14.29, 13.84, 16.14, 14.38, 14.69, 14.11, 16.63, 16.44, 15.26, 14.03, 13.89, 13.78, 13.74, 14.59, 13.99, 15.69, 14.7, 12.72, 14.16, 14.11, 15.88, 12.08, 15.01, 16.19, 13.02, 12.74, 14.11, 13.45, 13.16, 15.49, 14.09, 13.94, 15.05, 16.12, 16.2, 17.08, 14.8, 14.28, 13.54, 13.5, 13.16, 15.5, 15.11, 13.8, 15.36, 14.99, 14.79, 14.86, 14.43, 15.78, 14.49, 14.33, 14.52, 15.03, 14.46, 14.92, 15.38, 12.11, 11.42, 11.23, 12.36, 13.22, 12.78, 12.88, 14.34, 14.01, 14.37, 12.73, 17.63, 16.84, 17.26, 19.11, 16.82, 16.77, 17.32, 20.71, 18.94, 17.12, 16.53, 18.72, 20.2, 19.57, 19.51, 18.27, 18.88, 18.98, 21.18, 20.88, 20.1, 18.76, 18.81, 18.59, 18.36, 16.87, 19.31, 18.98, 18.17, 18.72, 16.41, 17.99, 19.46, 19.18, 18.95, 18.83, 18.85, 17.63, 19.94, 18.55, 18.45, 19.38, 19.13, 19.14, 20.97, 19.06, 18.96, 19.15, 18.89, 20.03, 20.24, 18.14, 16.17, 18.43, 15.99, 18.75, 18.65, 17.98, 20.16, 17.55, 18.3, 18.94, 15.38, 16.16, 15.56, 15.38, 17.36, 15.57, 15.6, 16.23, 13.07, 13.32, 13.34, 12.22, 11.82, 11.21, 11.43, 12.49, 12.7, 10.79, 11.83, 12.01, 12.26, 11.18, 11.36, 11.19, 11.34, 12.13, 11.75, 11.49, 12.54, 12.02, 12.05, 12.55, 11.14, 12.1, 12.44, 12.15, 11.35, 11.24, 11.02, 11.55, 11.27, 11.4, 10.83, 10.8, 11.26, 10.74, 11.48, 12.21, 11.41, 12.46, 12.19, 11.65, 12.89, 11.56, 11.81, 10.91, 11.23, 10.59, 10.93, 11.27, 11.87, 10.82, 12.11, 12.8, 12.79, 13.37, 12.62, 12.76, 12.38, 12.67, 11.18, 12.7, 12.37, 12.19, 11.23, 13.2, 11.84, 12.3 ))
seeds$perimeter <- c( 14.84, 14.57, 14.09, 13.94, 14.99, 14.21, 14.49, 14.1, 15.46, 15.25, 14.85, 14.16, 14.02, 14.06, 14.05, 14.28, 13.83, 14.75, 14.21, 13.57, 14.4, 14.26, 14.9, 13.23, 14.76, 15.16, 13.76, 13.67, 14.18, 14.02, 13.82, 14.94, 14.41, 14.17, 14.68, 15, 15.27, 15.38, 14.52, 14.17, 13.85, 13.85, 13.55, 14.86, 14.54, 14.04, 14.76, 14.56, 14.52, 14.67, 14.4, 14.91, 14.61, 14.28, 14.6, 14.77, 14.35, 14.43, 14.77, 13.47, 12.86, 12.63, 13.19, 13.84, 13.57, 13.5, 14.37, 14.29, 14.39, 13.75, 15.98, 15.67, 15.73, 16.26, 15.51, 15.62, 15.91, 17.23, 16.49, 15.55, 15.34, 16.19, 16.89, 16.74, 16.71, 16.09, 16.26, 16.66, 17.21, 17.05, 16.99, 16.2, 16.29, 16.05, 16.52, 15.65, 16.59, 16.57, 16.26, 16.34, 15.25, 15.86, 16.5, 16.63, 16.42, 16.29, 16.17, 15.86, 16.92, 16.22, 16.12, 16.72, 16.31, 16.61, 17.25, 16.45, 16.2, 16.45, 16.23, 16.9, 16.91, 16.12, 15.38, 15.97, 14.89, 16.18, 16.41, 15.85, 17.03, 15.66, 15.89, 16.32, 14.9, 15.33, 14.89, 14.66, 15.76, 15.15, 15.11, 15.18, 13.92, 13.94, 13.95, 13.32, 13.4, 13.13, 13.13, 13.46, 13.71, 12.93, 13.23, 13.52, 13.6, 13.04, 13.05, 13.05, 12.87, 13.73, 13.52, 13.22, 13.67, 13.33, 13.41, 13.57, 12.79, 13.15, 13.59, 13.45, 13.12, 13, 13, 13.1, 12.97, 13.08, 12.96, 12.57, 13.01, 12.73, 13.05, 13.47, 12.95, 13.41, 13.36, 13.07, 13.77, 13.31, 13.45, 12.8, 12.82, 12.41, 12.8, 12.86, 13.02, 12.83, 13.27, 13.47, 13.53, 13.78, 13.67, 13.38, 13.44, 13.32, 12.72, 13.41, 13.47, 13.2, 12.88, 13.66, 13.21, 13.34 )
seeds$compactness <- c( 0.87, 0.88, 0.9, 0.9, 0.9, 0.9, 0.88, 0.89, 0.87, 0.89, 0.87, 0.88, 0.89, 0.88, 0.87, 0.9, 0.92, 0.91, 0.92, 0.87, 0.86, 0.87, 0.9, 0.87, 0.87, 0.88, 0.86, 0.86, 0.88, 0.86, 0.87, 0.87, 0.85, 0.87, 0.88, 0.9, 0.87, 0.91, 0.88, 0.89, 0.89, 0.89, 0.9, 0.88, 0.9, 0.88, 0.89, 0.89, 0.88, 0.87, 0.88, 0.89, 0.85, 0.88, 0.86, 0.87, 0.88, 0.9, 0.89, 0.84, 0.87, 0.88, 0.89, 0.87, 0.87, 0.89, 0.87, 0.86, 0.87, 0.85, 0.87, 0.86, 0.88, 0.91, 0.88, 0.86, 0.86, 0.88, 0.88, 0.89, 0.88, 0.9, 0.89, 0.88, 0.88, 0.89, 0.9, 0.86, 0.9, 0.9, 0.87, 0.9, 0.89, 0.91, 0.85, 0.86, 0.88, 0.87, 0.86, 0.88, 0.89, 0.9, 0.9, 0.87, 0.88, 0.89, 0.91, 0.88, 0.88, 0.89, 0.89, 0.87, 0.9, 0.87, 0.89, 0.89, 0.91, 0.89, 0.9, 0.88, 0.89, 0.88, 0.86, 0.91, 0.91, 0.9, 0.87, 0.9, 0.87, 0.9, 0.91, 0.89, 0.87, 0.86, 0.88, 0.9, 0.88, 0.85, 0.86, 0.88, 0.85, 0.86, 0.86, 0.87, 0.83, 0.82, 0.83, 0.87, 0.85, 0.81, 0.85, 0.82, 0.83, 0.83, 0.84, 0.83, 0.86, 0.81, 0.81, 0.83, 0.84, 0.85, 0.84, 0.86, 0.86, 0.88, 0.85, 0.84, 0.83, 0.84, 0.82, 0.85, 0.84, 0.84, 0.81, 0.86, 0.84, 0.83, 0.85, 0.85, 0.86, 0.87, 0.86, 0.86, 0.85, 0.82, 0.82, 0.84, 0.86, 0.86, 0.84, 0.86, 0.88, 0.83, 0.86, 0.89, 0.88, 0.88, 0.85, 0.9, 0.86, 0.9, 0.87, 0.89, 0.86, 0.88, 0.85, 0.89, 0.85, 0.87 )
seeds$length <- c( 5.76, 5.55, 5.29, 5.32, 5.66, 5.39, 5.56, 5.42, 6.05, 5.88, 5.71, 5.44, 5.44, 5.48, 5.48, 5.35, 5.12, 5.53, 5.21, 5.23, 5.66, 5.52, 5.62, 5.1, 5.79, 5.83, 5.39, 5.39, 5.54, 5.52, 5.45, 5.76, 5.72, 5.58, 5.71, 5.71, 5.83, 5.83, 5.66, 5.4, 5.35, 5.35, 5.14, 5.88, 5.58, 5.38, 5.7, 5.57, 5.54, 5.68, 5.58, 5.67, 5.71, 5.5, 5.74, 5.7, 5.39, 5.38, 5.66, 5.16, 5.01, 4.9, 5.08, 5.39, 5.26, 5.14, 5.63, 5.61, 5.57, 5.41, 6.19, 6, 5.98, 6.15, 6.02, 5.93, 6.06, 6.58, 6.45, 5.85, 5.88, 6.01, 6.29, 6.38, 6.37, 6.17, 6.08, 6.55, 6.57, 6.45, 6.58, 6.17, 6.27, 6.04, 6.67, 6.14, 6.34, 6.45, 6.27, 6.22, 5.72, 5.89, 6.11, 6.37, 6.25, 6.04, 6.15, 6.03, 6.67, 6.15, 6.11, 6.3, 6.18, 6.26, 6.56, 6.42, 6.05, 6.25, 6.23, 6.49, 6.32, 6.06, 5.76, 5.98, 5.36, 6.11, 6.29, 5.98, 6.51, 5.79, 5.98, 6.14, 5.88, 5.84, 5.78, 5.48, 6.14, 5.92, 5.83, 5.87, 5.47, 5.54, 5.39, 5.22, 5.31, 5.28, 5.18, 5.27, 5.39, 5.32, 5.26, 5.41, 5.41, 5.22, 5.17, 5.25, 5.05, 5.39, 5.44, 5.3, 5.45, 5.35, 5.27, 5.33, 5.01, 5.11, 5.32, 5.42, 5.18, 5.09, 5.33, 5.17, 5.09, 5.14, 5.28, 4.98, 5.19, 5.14, 5.18, 5.36, 5.09, 5.24, 5.24, 5.11, 5.5, 5.36, 5.41, 5.09, 5.09, 4.9, 5.05, 5.09, 5.13, 5.18, 5.24, 5.16, 5.22, 5.32, 5.41, 5.07, 5.22, 4.98, 5.01, 5.18, 5.2, 5.14, 5.14, 5.24, 5.17, 5.24 )
seeds$width <- c( 3.31, 3.33, 3.34, 3.38, 3.56, 3.31, 3.26, 3.3, 3.46, 3.5, 3.24, 3.2, 3.2, 3.16, 3.11, 3.33, 3.38, 3.51, 3.47, 3.05, 3.13, 3.17, 3.51, 2.94, 3.25, 3.42, 3.03, 2.96, 3.22, 3.06, 2.98, 3.37, 3.19, 3.15, 3.33, 3.48, 3.46, 3.68, 3.29, 3.3, 3.16, 3.16, 3.2, 3.4, 3.46, 3.15, 3.39, 3.38, 3.29, 3.26, 3.27, 3.43, 3.11, 3.2, 3.11, 3.21, 3.38, 3.41, 3.42, 3.03, 2.85, 2.88, 3.04, 3.07, 3.03, 3.12, 3.19, 3.16, 3.15, 2.88, 3.56, 3.48, 3.59, 3.93, 3.49, 3.44, 3.4, 3.81, 3.64, 3.57, 3.47, 3.86, 3.86, 3.77, 3.8, 3.65, 3.76, 3.67, 4.03, 4.03, 3.79, 3.8, 3.69, 3.86, 3.48, 3.46, 3.81, 3.55, 3.51, 3.68, 3.52, 3.69, 3.89, 3.68, 3.75, 3.79, 3.81, 3.57, 3.76, 3.67, 3.77, 3.79, 3.9, 3.74, 3.99, 3.72, 3.9, 3.82, 3.77, 3.86, 3.96, 3.56, 3.39, 3.77, 3.58, 3.87, 3.59, 3.69, 3.77, 3.69, 3.75, 3.83, 3.27, 3.4, 3.41, 3.46, 3.57, 3.23, 3.29, 3.47, 2.99, 3.07, 3.07, 2.97, 2.78, 2.69, 2.72, 2.97, 2.91, 2.65, 2.84, 2.78, 2.83, 2.69, 2.75, 2.67, 2.85, 2.75, 2.68, 2.69, 2.88, 2.81, 2.85, 2.97, 2.79, 2.94, 2.9, 2.84, 2.67, 2.71, 2.7, 2.85, 2.76, 2.76, 2.64, 2.82, 2.71, 2.64, 2.76, 2.89, 2.77, 3.02, 2.91, 2.85, 3.03, 2.68, 2.72, 2.67, 2.82, 2.79, 2.72, 2.8, 2.95, 2.63, 2.98, 3.13, 3.05, 3.13, 2.91, 3.15, 2.99, 3.13, 2.81, 3.09, 2.96, 2.98, 2.8, 3.23, 2.84, 2.97 )
seeds$asymmetry <- c( 2.22, 1.02, 2.7, 2.26, 1.36, 2.46, 3.59, 2.7, 2.04, 1.97, 4.54, 1.72, 3.99, 3.14, 2.93, 4.18, 5.23, 1.6, 1.77, 4.1, 3.07, 2.69, 0.77, 1.42, 1.79, 0.9, 3.37, 2.5, 2.75, 3.53, 0.86, 3.41, 3.92, 2.12, 2.13, 2.27, 2.82, 2.96, 3.11, 6.68, 2.59, 2.25, 2.46, 4.71, 3.13, 1.56, 1.37, 2.96, 2.7, 2.13, 3.98, 5.59, 4.12, 3.33, 1.48, 1.93, 2.8, 1.14, 2, 1.5, 2.7, 2.27, 3.22, 4.16, 1.18, 2.35, 1.31, 2.22, 1.46, 3.53, 4.08, 4.67, 4.54, 2.94, 4, 4.92, 3.82, 4.45, 5.06, 2.86, 5.53, 5.32, 5.17, 1.47, 2.96, 2.44, 1.65, 3.69, 5.78, 5.02, 1.96, 3.12, 3.24, 6, 4.93, 3.7, 3.48, 2.14, 2.85, 2.19, 4.22, 2.07, 4.31, 3.36, 3.37, 2.55, 2.84, 3.75, 3.25, 1.74, 2.23, 3.68, 2.11, 6.68, 4.68, 2.25, 4.33, 3.08, 3.64, 3.06, 5.9, 3.62, 4.29, 2.98, 3.34, 4.19, 4.39, 2.26, 1.91, 5.37, 2.84, 2.91, 4.46, 4.27, 4.97, 3.6, 3.53, 2.64, 2.73, 3.77, 5.3, 7.04, 6, 5.47, 4.47, 6.17, 2.22, 4.42, 3.26, 5.46, 5.2, 6.99, 4.76, 3.33, 4.05, 5.81, 3.35, 4.83, 4.38, 5.39, 3.08, 4.27, 4.99, 4.42, 6.39, 2.2, 4.92, 3.64, 4.34, 3.52, 6.74, 6.71, 4.31, 5.59, 5.18, 4.77, 5.34, 4.7, 5.88, 1.66, 4.96, 4.99, 4.86, 5.21, 6.18, 4.06, 4.9, 4.18, 7.52, 4.97, 5.4, 3.98, 3.6, 4.85, 4.13, 4.87, 5.48, 4.67, 3.31, 2.83, 5.47, 2.3, 4.05, 8.46, 3.92, 3.63, 4.33, 8.31, 3.6, 5.64 )
seeds$groove_length <- c( 5.22, 4.96, 4.83, 4.8, 5.17, 4.96, 5.22, 5, 5.88, 5.53, 5.31, 5, 4.74, 4.87, 4.83, 4.78, 4.78, 5.05, 4.65, 4.91, 5.18, 5.22, 5.09, 4.96, 5, 5.31, 4.83, 4.87, 5.04, 5.1, 5.06, 5.23, 5.3, 5.01, 5.36, 5.44, 5.53, 5.48, 5.31, 5, 5.18, 5.18, 4.78, 5.53, 5.18, 4.96, 5.13, 5.17, 5.11, 5.35, 5.14, 5.14, 5.4, 5.22, 5.49, 5.44, 5.04, 5.09, 5.22, 4.52, 4.61, 4.7, 4.61, 5.09, 4.78, 4.61, 5.15, 5.13, 5.3, 5.07, 6.06, 5.88, 5.79, 6.08, 5.84, 5.8, 5.92, 6.45, 6.36, 5.75, 5.88, 5.88, 6.19, 6.27, 6.18, 6.2, 6.11, 6.5, 6.23, 6.32, 6.45, 6.05, 6.05, 5.88, 6.45, 5.97, 6.24, 6.45, 6.27, 6.1, 5.62, 5.84, 6.01, 6.23, 6.15, 5.88, 6.2, 5.93, 6.55, 5.89, 5.79, 5.96, 5.92, 6.05, 6.32, 6.16, 5.75, 6.18, 5.97, 6.32, 6.19, 6.01, 5.7, 5.91, 5.14, 5.99, 6.1, 5.92, 6.18, 5.66, 5.96, 5.95, 5.8, 5.8, 5.85, 5.44, 5.97, 5.88, 5.75, 5.92, 5.39, 5.44, 5.31, 5.22, 5.18, 5.28, 5.13, 5, 5.32, 5.19, 5.31, 5.27, 5.36, 5, 5.26, 5.22, 5, 5.22, 5.31, 5.31, 5.49, 5.31, 5.05, 5.18, 5.05, 5.06, 5.27, 5.34, 5.13, 5.09, 5.16, 4.96, 5, 5.09, 5.18, 5.06, 5.09, 4.96, 5, 5.18, 4.83, 5.15, 5.16, 5.13, 5.32, 5.18, 5.35, 4.96, 4.96, 4.79, 5.04, 5, 5.13, 5.09, 5.01, 4.91, 4.96, 5.09, 5.23, 4.83, 5.04, 4.75, 4.83, 5, 5, 4.87, 5, 5.06, 5.04, 5.06 )
str(seeds)
## 'data.frame': 210 obs. of 7 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.87 0.88 0.9 0.9 0.9 0.9 0.88 0.89 0.87 0.89 ...
## $ length : num 5.76 5.55 5.29 5.32 5.66 5.39 5.56 5.42 6.05 5.88 ...
## $ width : num 3.31 3.33 3.34 3.38 3.56 3.31 3.26 3.3 3.46 3.5 ...
## $ asymmetry : num 2.22 1.02 2.7 2.26 1.36 2.46 3.59 2.7 2.04 1.97 ...
## $ groove_length: num 5.22 4.96 4.83 4.8 5.17 4.96 5.22 5 5.88 5.53 ...
seeds_type <- c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 )
# Partition the 7 seed features into 3 clusters. nstart=20 reruns the
# algorithm from 20 random centroid initializations and keeps the solution
# with the lowest total within-cluster sum of squares.
# NOTE(review): no set.seed() precedes this call, so the recorded output
# below reflects one particular random run.
# Do k-means clustering with three clusters, repeat 20 times: seeds_km
seeds_km <- kmeans(seeds, centers=3, nstart=20)
# Print out seeds_km
seeds_km
## K-means clustering with 3 clusters of sizes 72, 77, 61
##
## Cluster means:
## area perimeter compactness length width asymmetry groove_length
## 1 14.64847 14.46042 0.8794444 5.563333 3.277639 2.649306 5.192778
## 2 11.96442 13.27481 0.8529870 5.229481 2.872857 4.759870 5.088442
## 3 18.72180 16.29738 0.8855738 6.209016 3.721967 3.603607 6.065902
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## [36] 1 1 3 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 2
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 3 3 3 3 3 3 3 1 1 1 1 3 1 1 1
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 207.4138 195.7171 184.0488
## (between_SS / total_SS = 78.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Compare clusters with actual seed types. Set k-means clusters as rows.
# A table that is near-diagonal (up to label permutation) means the
# clustering largely recovers the true varieties.
table(seeds_km$cluster, seeds_type)
## seeds_type
## 1 2 3
## 1 60 10 2
## 2 9 0 68
## 3 1 60 0
# Plot the length as function of width. Color by cluster
plot(x=seeds$width, y=seeds$length, col=seeds_km$cluster)
# Demonstrate run-to-run instability: with only one random start (nstart=1),
# k-means can converge to different local optima, so two runs on the same
# data need not agree.
# Apply kmeans to seeds twice: seeds_km_1 and seeds_km_2
seeds_km_1 <- kmeans(seeds, centers=5, nstart=1)
seeds_km_2 <- kmeans(seeds, centers=5, nstart=1)
# Return the ratio of the within cluster sum of squares.
# A ratio different from 1 shows the two runs found solutions of
# different quality (here ~6% apart).
seeds_km_1$tot.withinss / seeds_km_2$tot.withinss
## [1] 1.062865
# Compare the resulting clusters: off-diagonal spread shows the two
# partitions disagree, not just relabel each other.
table(seeds_km_1$cluster, seeds_km_2$cluster)
##
## 1 2 3 4 5
## 1 0 14 0 27 0
## 2 0 0 48 0 6
## 3 21 29 0 0 0
## 4 21 0 0 0 0
## 5 0 0 0 19 25
# Average reading and arithmetic scores for 25 schools, measured in grade 4
# and again in grade 6 (column suffix = grade). Same values as the original
# literal, assembled in one data.frame() call.
school_result <- data.frame(
  reading.4 = c(
    2.7, 3.9, 4.8, 3.1, 3.4, 3.1, 4.6, 3.1, 3.8, 5.2, 3.9, 4.1, 5.7,
    3, 2.9, 3.4, 4, 3, 4, 3, 3.6, 3.1, 3.2, 3, 3.8
  ),
  arithmetic.4 = c(
    3.2, 3.8, 4.1, 3.5, 3.7, 3.4, 4.4, 3.3, 3.7, 4.9, 3.8, 4, 5.1,
    3.2, 3.3, 3.3, 4.2, 3, 4.1, 3.2, 3.6, 3.2, 3.3, 3.4, 4
  ),
  reading.6 = c(
    4.5, 5.9, 6.8, 4.3, 5.1, 4.1, 6.6, 4, 4.7, 8.2, 5.2, 5.6, 7,
    4.5, 4.5, 4.4, 5.2, 4.6, 5.9, 4.4, 5.3, 4.6, 5.4, 4.2, 6.9
  ),
  arithmetic.6 = c(
    4.8, 6.2, 5.5, 4.6, 5.6, 4.7, 6.1, 4.9, 4.9, 6.9, 5.4, 5.6, 6.3,
    5, 5.1, 5, 5.4, 5, 5.8, 5.1, 5.4, 5, 5.3, 4.7, 6.7
  )
)
# Sanity-check the assembled data frame: 25 schools x 4 numeric score columns.
# Explore the structure of your data
str(school_result)
## 'data.frame': 25 obs. of 4 variables:
## $ reading.4 : num 2.7 3.9 4.8 3.1 3.4 3.1 4.6 3.1 3.8 5.2 ...
## $ arithmetic.4: num 3.2 3.8 4.1 3.5 3.7 3.4 4.4 3.3 3.7 4.9 ...
## $ reading.6 : num 4.5 5.9 6.8 4.3 5.1 4.1 6.6 4 4.7 8.2 ...
## $ arithmetic.6: num 4.8 6.2 5.5 4.6 5.6 4.7 6.1 4.9 4.9 6.9 ...
# Scree ("elbow") analysis: for each candidate k, fit k-means (20 random
# restarts) and record the ratio of total within-cluster SS to total SS.
# The ratio is exactly 1 at k = 1 and shrinks as k grows; a kink in the
# plotted curve suggests a reasonable number of clusters.
# Keep the number of candidate k values in one place instead of repeating
# the magic constant 7 in both rep() and the loop range.
max_k <- 7
# Initialise ratio_ss (preallocated, one slot per candidate k)
ratio_ss <- numeric(max_k)
for (k in seq_len(max_k)) {
  # Apply k-means to school_result: school_km
  school_km <- kmeans(school_result, centers = k, nstart = 20)
  # Save the ratio of WSS to TSS in the kth element of ratio_ss
  ratio_ss[k] <- school_km$tot.withinss / school_km$totss
}
# Make a scree plot with type "b" and xlab "k"
plot(ratio_ss, type = "b", xlab = "k")
# Men's national record times for 54 countries across eight running events.
# Magnitudes are consistent with all columns being in seconds (e.g. marathon
# ~7473-10276 s); NOTE(review): confirm units against the course material,
# which sometimes quotes the longer events in minutes.
run_record <- data.frame(X100m=c( 10.23, 9.93, 10.15, 10.14, 10.27, 10, 9.84, 10.1, 10.17, 10.29, 10.97, 10.32, 10.24, 10.29, 10.16, 10.21, 10.02, 10.06, 9.87, 10.11, 10.32, 10.08, 10.33, 10.2, 10.35, 10.2, 10.01, 10, 10.28, 10.34, 10.6, 10.41, 10.3, 10.13, 10.21, 10.64, 10.19, 10.11, 10.08, 10.4, 10.57, 10, 9.86, 10.21, 10.11, 10.78, 10.37, 10.17, 10.18, 10.16, 10.36, 10.23, 10.38, 9.78 )
)
run_record$X200m <- c( 20.37, 20.06, 20.45, 20.19, 20.3, 19.89, 20.17, 20.15, 20.42, 20.85, 22.46, 20.96, 20.61, 20.52, 20.65, 20.47, 20.16, 20.23, 19.94, 19.85, 21.09, 20.11, 20.73, 20.93, 20.54, 20.89, 19.72, 20.03, 20.43, 20.41, 21.23, 20.77, 20.92, 20.06, 20.4, 21.52, 20.19, 20.42, 20.17, 21.18, 21.43, 19.98, 20.12, 20.75, 20.23, 21.86, 21.14, 20.59, 20.43, 20.41, 20.81, 20.69, 21.04, 19.32
)
run_record$X400m <- c( 46.18, 44.38, 45.8, 45.02, 45.26, 44.29, 44.72, 45.92, 45.25, 45.84, 51.4, 46.42, 45.77, 45.89, 44.9, 45.49, 44.64, 44.33, 44.36, 45.57, 48.44, 45.43, 45.48, 46.37, 45.58, 46.59, 45.26, 44.78, 44.18, 45.37, 46.95, 47.9, 46.41, 44.69, 44.31, 48.63, 45.68, 46.09, 46.11, 46.77, 45.57, 44.62, 46.11, 45.77, 44.6, 49.98, 47.6, 44.96, 45.54, 44.99, 46.72, 46.05, 46.63, 43.18
)
run_record$X800m <- c( 106.2, 104.4, 106.2, 103.8, 107.4, 102, 105, 105.6, 106.2, 108, 116.4, 112.2, 105, 101.4, 108.6, 104.4, 103.2, 103.8, 102, 105, 109.2, 105.6, 105.6, 109.8, 105, 108, 103.8, 106.2, 102, 104.4, 109.2, 105.6, 107.4, 108, 106.8, 108, 103.8, 104.4, 102.6, 108, 108, 103.2, 105, 105.6, 102.6, 116.4, 110.4, 103.8, 105.6, 102.6, 107.4, 108.6, 106.8, 102.6
)
run_record$X1500m <- c( 220.8, 211.8, 214.8, 214.2, 222, 214.2, 211.8, 219, 216.6, 223.2, 254.4, 230.4, 214.8, 211.2, 223.8, 216.6, 208.8, 211.8, 209.4, 216.6, 224.4, 215.4, 217.8, 226.2, 213.6, 222, 213, 217.2, 206.4, 218.4, 226.2, 220.2, 225.6, 229.8, 217.8, 228, 213, 212.4, 217.2, 240, 229.2, 215.4, 210, 214.2, 212.4, 240.6, 231.6, 208.8, 216.6, 211.8, 226.2, 226.2, 215.4, 207.6
)
run_record$X5000m <- c( 799.8, 775.8, 795.6, 769.8, 878.4, 808.8, 793.8, 803.4, 805.2, 809.4, 1002, 825, 805.2, 805.2, 858.6, 796.2, 778.8, 774.6, 780.6, 808.8, 838.8, 807, 810, 852.6, 784.2, 819.6, 785.4, 793.2, 759.6, 830.4, 834, 818.4, 846.6, 849, 787.8, 851.4, 793.2, 792.6, 786.6, 883.2, 838.2, 797.4, 783, 795, 792, 976.8, 897.6, 782.4, 797.4, 787.8, 834.6, 855, 807, 778.2
)
run_record$X10000m <- c( 1659, 1651.8, 1663.2, 1612.2, 1829.4, 1687.8, 1656, 1685.4, 1690.2, 1672.8, 2122.8, 1728.6, 1668, 1674.6, 1825.8, 1651.2, 1642.8, 1641.6, 1638, 1687.2, 1760.4, 1681.8, 1728.6, 1779, 1666.8, 1723.2, 1636.8, 1654.8, 1587.6, 1710.6, 1707, 1726.2, 1770, 1790.4, 1628.4, 1777.2, 1646.4, 1662, 1652.4, 1881.6, 1742.4, 1673.4, 1632.6, 1660.2, 1674, 2082.6, 1879.2, 1634.4, 1675.8, 1674, 1752, 1780.2, 1699.8, 1633.8
)
run_record$marathon <- c( 7774.2, 7650.6, 7933.2, 7632, 8782.2, 7563, 7805.4, 7931.4, 7750.8, 7870.2, 10275.6, 7993.8, 7894.2, 7765.8, 8760, 7869, 7581.6, 7708.2, 7627.8, 7922.4, 7951.8, 7926, 7920, 8350.8, 7749, 8052.6, 7637.4, 7569.6, 7473, 7632, 7755.6, 8041.8, 8956.2, 8584.2, 7631.4, 8374.2, 7698.6, 7715.4, 7810.2, 8887.8, 8306.4, 7753.8, 7581.6, 7938, 7749.6, 9690, 8653.2, 7633.8, 7822.8, 7773.6, 8061, 8359.8, 7815, 7522.8
)
# Label each row with its country so clusters can be interpreted later.
rownames(run_record) <- c( 'Argentina', 'Australia', 'Austria', 'Belgium', 'Bermuda', 'Brazil', 'Canada', 'Chile', 'China', 'Columbia', 'CookIslands', 'CostaRica', 'CzechRepublic', 'Denmark', 'DominicanRepub', 'Finland', 'France', 'Germany', 'GreatBritain', 'Greece', 'Guatemala', 'Hungary', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kenya', 'Korea,South', 'Korea,North', 'Luxembourg', 'Malaysia', 'Mauritius', 'Mexico', 'Myanmar(Burma)', 'Netherlands', 'NewZealand', 'Norway', 'PapuaNewGuinea', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Samoa', 'Singapore', 'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'U.S.A.'
)
# Inspect run_record: note the wildly different column scales (X100m ~10 vs
# marathon ~8000), which motivates standardizing before distance-based
# clustering further below.
# Explore your data with str() and summary()
str(run_record)
## 'data.frame': 54 obs. of 8 variables:
## $ X100m : num 10.23 9.93 10.15 10.14 10.27 ...
## $ X200m : num 20.4 20.1 20.4 20.2 20.3 ...
## $ X400m : num 46.2 44.4 45.8 45 45.3 ...
## $ X800m : num 106 104 106 104 107 ...
## $ X1500m : num 221 212 215 214 222 ...
## $ X5000m : num 800 776 796 770 878 ...
## $ X10000m : num 1659 1652 1663 1612 1829 ...
## $ marathon: num 7774 7651 7933 7632 8782 ...
summary(run_record)
## X100m X200m X400m X800m
## Min. : 9.78 Min. :19.32 Min. :43.18 Min. :101.4
## 1st Qu.:10.10 1st Qu.:20.17 1st Qu.:44.91 1st Qu.:103.8
## Median :10.20 Median :20.43 Median :45.58 Median :105.6
## Mean :10.22 Mean :20.54 Mean :45.83 Mean :106.1
## 3rd Qu.:10.32 3rd Qu.:20.84 3rd Qu.:46.32 3rd Qu.:108.0
## Max. :10.97 Max. :22.46 Max. :51.40 Max. :116.4
## X1500m X5000m X10000m marathon
## Min. :206.4 Min. : 759.6 Min. :1588 Min. : 7473
## 1st Qu.:213.0 1st Qu.: 788.9 1st Qu.:1653 1st Qu.: 7701
## Median :216.6 Median : 805.2 Median :1675 Median : 7819
## Mean :219.2 Mean : 817.1 Mean :1712 Mean : 8009
## 3rd Qu.:224.2 3rd Qu.: 834.5 3rd Qu.:1739 3rd Qu.: 8050
## Max. :254.4 Max. :1002.0 Max. :2123 Max. :10276
# Cluster the raw (unscaled) data first; the marathon column's large scale
# dominates the Euclidean distance here.
# Cluster run_record using k-means: run_km. 5 clusters, repeat 20 times
run_km <- kmeans(run_record, centers=5, nstart=20)
# Plot the 100m as function of the marathon. Color using clusters
plot(x=run_record$marathon, y=run_record$X100m, col=run_km$cluster)
# Calculate Dunn's index: dunn_km. Print it.
# (clValid::dunn = smallest between-cluster separation divided by largest
# cluster diameter; higher means better-separated, more compact clusters.)
(dunn_km <- clValid::dunn(clusters=run_km$cluster, Data=run_record))
## [1] 0.05954843
# Standardize run_record, transform to a dataframe: run_record_sc
# (scale() returns a matrix, hence the as.data.frame wrapper).
run_record_sc <- as.data.frame( scale(run_record) )
# Cluster run_record_sc using k-means: run_km_sc. 5 groups, let R start over 20 times
run_km_sc <- kmeans(run_record_sc, centers=5, nstart=20)
# Plot records on 100m as function of the marathon. Color using the clusters in run_km_sc
plot(x=run_record$marathon, y=run_record$X100m, col=run_km_sc$cluster,
xlab="Marathon", ylab="100 metres"
)
# Compare the resulting clusters in a nice table: scaling changes the
# partition substantially.
table(run_km$cluster, run_km_sc$cluster)
##
## 1 2 3 4 5
## 1 0 0 2 2 0
## 2 0 0 0 6 0
## 3 3 15 8 0 0
## 4 0 0 0 0 2
## 5 11 5 0 0 0
# Calculate Dunn's index: dunn_km_sc. Print it.
# Higher than the unscaled run (0.145 vs 0.060), i.e. standardizing
# improved cluster quality by this measure.
(dunn_km_sc <- clValid::dunn(clusters=run_km_sc$cluster, Data=run_record_sc))
## [1] 0.1453556
# Hierarchical clustering on the standardized records: build one pairwise
# Euclidean distance matrix, then compare single-linkage (merge by closest
# pair; prone to "chaining") with complete-linkage (merge by farthest pair;
# tends to give compact, balanced clusters).
# Apply dist() to run_record_sc: run_dist
run_dist <- dist(run_record_sc)
# Apply hclust() to run_dist: run_single
run_single <- hclust(run_dist, method="single")
# Apply cutree() to run_single: memb_single
memb_single <- cutree(run_single, k=5)
# Apply plot() on run_single to draw the dendrogram
plot(run_single)
# Apply rect.hclust() on run_single to draw the boxes
rect.hclust(run_single, k=5, border=2:6)
# Apply hclust() to run_dist: run_complete
run_complete <- hclust(run_dist, method="complete")
# Apply cutree() to run_complete: memb_complete
memb_complete <- cutree(run_complete, k=5)
# Apply plot() on run_complete to draw the dendrogram
plot(run_complete)
# Apply rect.hclust() on run_complete to draw the boxes
rect.hclust(run_complete, k=5, border=2:6)
# table() the clusters memb_single and memb_complete. Put memb_single in the rows.
# Single linkage lumps 49 of 54 countries into one cluster (chaining),
# while complete linkage spreads them out.
table(memb_single, memb_complete)
## memb_complete
## memb_single 1 2 3 4 5
## 1 27 7 14 0 1
## 2 0 0 0 1 0
## 3 0 0 0 0 1
## 4 0 0 0 0 2
## 5 0 0 0 1 0
# Dunn's index for k-means: dunn_km
dunn_km <- clValid::dunn(clusters=run_km_sc$cluster, Data=run_record_sc)
# Dunn's index for single-linkage: dunn_single
dunn_single <- clValid::dunn(clusters=memb_single, Data=run_record_sc)
# Dunn's index for complete-linkage: dunn_complete
dunn_complete <- clValid::dunn(clusters=memb_complete, Data=run_record_sc)
# Compare k-means with single-linkage
table(run_km_sc$cluster, memb_single)
## memb_single
## 1 2 3 4 5
## 1 14 0 0 0 0
## 2 20 0 0 0 0
## 3 9 0 1 0 0
## 4 6 0 0 2 0
## 5 0 1 0 0 1
# Compare k-means with complete-linkage
table(run_km_sc$cluster, memb_complete)
## memb_complete
## 1 2 3 4 5
## 1 7 7 0 0 0
## 2 20 0 0 0 0
## 3 0 0 8 0 2
## 4 0 0 6 0 2
## 5 0 0 0 2 0
# Per-state US crime statistics (USArrests-style): murder, assault and rape
# rates plus percent urban population for all 50 states. Built in a single
# data.frame() call with row.names, which yields the same object as the
# original column-by-column construction followed by rownames()<-.
crime_data <- data.frame(
  murder = c(
    13.2, 10, 8.1, 8.8, 9, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2.6, 10.4,
    7.2, 2.2, 6, 9.7, 15.4, 2.1, 11.3, 4.4, 12.1, 2.7, 16.1, 9, 6,
    4.3, 12.2, 2.1, 7.4, 11.4, 11.1, 13, 0.8, 7.3, 6.6, 4.9, 6.3, 3.4,
    14.4, 3.8, 13.2, 12.7, 3.2, 2.2, 8.5, 4, 5.7, 2.6, 6.8
  ),
  assault = c(
    236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 249,
    113, 56, 115, 109, 249, 83, 300, 149, 255, 72, 259, 178, 109,
    102, 252, 57, 159, 285, 254, 337, 45, 120, 151, 159, 106, 174,
    279, 86, 188, 201, 120, 48, 156, 145, 81, 53, 161
  ),
  urb_pop = c(
    58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, 66,
    52, 66, 51, 67, 85, 74, 66, 44, 70, 53, 62, 81, 56, 89, 70, 86,
    45, 44, 75, 68, 67, 72, 87, 48, 45, 59, 80, 80, 32, 63, 73, 39,
    66, 60
  ),
  rape = c(
    21.2, 44.5, 31, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, 20.2,
    14.2, 24, 21, 11.3, 18, 16.3, 22.2, 7.8, 27.8, 16.3, 35.1, 14.9,
    17.1, 28.2, 16.4, 16.5, 46, 9.5, 18.8, 32.1, 26.1, 16.1, 7.3,
    21.4, 20, 29.3, 14.9, 8.3, 22.5, 12.8, 26.9, 25.5, 22.9, 11.2,
    20.7, 26.2, 9.3, 10.8, 15.6
  ),
  row.names = c(
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
    'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
    'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
    'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
  )
)
# Sanity-check the crime data: 50 states x 4 numeric columns.
str(crime_data)
## 'data.frame': 50 obs. of 4 variables:
## $ murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ assault: num 236 263 294 190 276 204 110 238 335 211 ...
## $ urb_pop: num 58 48 80 50 91 78 77 72 80 60 ...
## $ rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
# Compare k-means and single-linkage hierarchical clustering on the
# standardized crime data using Dunn's index.
# Scale the dataset: crime_data_sc
crime_data_sc <- as.data.frame(scale(crime_data))
# Perform k-means clustering: crime_km
crime_km <- kmeans(crime_data_sc, centers=4, nstart=20)
# Perform single-linkage hierarchical clustering
## Calculate the distance matrix: dist_matrix
dist_matrix <- dist(crime_data_sc)
## Calculate the clusters using hclust(): crime_single
crime_single <- hclust(dist_matrix, method="single")
## Cut the clusters using cutree: memb_single
memb_single <- cutree(crime_single, k=4)
# Calculate the Dunn's index for both clusterings: dunn_km, dunn_single
dunn_km <- clValid::dunn(clusters=crime_km$cluster, Data=crime_data_sc)
dunn_single <- clValid::dunn(clusters=memb_single, Data=crime_data_sc)
# Print out the results.
# Single linkage "wins" on Dunn's index here, but the table below shows
# why that is misleading: it dumps 46 of 50 states into one cluster
# (chaining), leaving a few outliers as tiny clusters.
dunn_km
## [1] 0.1604403
dunn_single
## [1] 0.2438734
table(crime_km$cluster, memb_single)
## memb_single
## 1 2 3 4
## 1 9 1 2 1
## 2 8 0 0 0
## 3 16 0 0 0
## 4 13 0 0 0
Chapter 1 - Unsupervised Learning in R
Introduction to the main types of machine learning:
Introduction to k-means clustering — assume a fixed number of sub-groups, then iteratively assign points to clusters and update the centroids:
How kmeans works and practical matters:
Introduction to the Pokemon data — 800 Pokemon, each with 6 features:
Example code includes:
# Synthetic 2-D dataset for the k-means exercises: a 300 x 2 matrix,
# allocated empty and then filled one column at a time. The values form
# three visible groups (roughly x1 ~ 2, x1 ~ -5, and x1 ~ 1 with lower x2),
# which is what the 3-cluster fits below are expected to recover.
x <- matrix(data=NA, nrow=300, ncol=2)
# First coordinate of each of the 300 points.
x[,1] <- c( 3.37, 1.44, 2.36, 2.63, 2.4, 1.89, 3.51, 1.91, 4.02, 1.94, 3.3, 4.29, 0.61, 1.72, 1.87, 2.64, 1.72, -0.66, -0.44, 3.32, 1.69, 0.22, 1.83, 3.21, 3.9, 1.57, 1.74, 0.24, 2.46, 1.36, 2.46, 2.7, 3.04, 1.39, 2.5, 0.28, 1.22, 1.15, -0.41, 2.04, 2.21, 1.64, 2.76, 1.27, 0.63, 2.43, 1.19, 3.44, 1.57, 2.66, 2.32, 1.22, 3.58, 2.64, 2.09, 2.28, 2.68, 2.09, -0.99, 2.28, 1.63, 2.19, 2.58, 3.4, 1.27, 3.3, 2.34, 3.04, 2.92, 2.72, 0.96, 1.91, 2.62, 1.05, 1.46, 2.58, 2.77, 2.46, 1.11, 0.9, 3.51, 2.26, 2.09, 1.88, 0.81, 2.61, 1.78, 1.82, 2.93, 2.82, 3.39, 1.52, 2.65, 3.39, 0.89, 1.14, 0.87, 0.54, 2.08, 2.65, -3.8, -3.96, -6, -3.15, -5.67, -4.89, -5.42, -5.12, -4.81, -4.88, -5.03, -4.89, -5.49, -5.5, -6.66, -5.38, -5.51, -2.3, -6.36, -4.86, -6.49, -6.47, -4.88, -6, -5, -5.43, -5.61, -7.02, -6.22, -4.82, -4.43, -5.49, -5, -3.88, -3.56, -6.1, -5.12, -3.8, -5.47, -5.05, -5.09, -5.89, -5.44, -5.03, -5.41, -3.89, -5.48, -5.43, -4.3, -6.06, -5.04, -6.55, -3.83, -5.27, -5.47, -6.24, -5.01, -5.8, -5.53, -3.71, -5.18, -6.07, -4.84, -5.36, -4.41, -3.57, -5.99, -4.55, -4.92, -4.1, -5.23, -4.16, -6.75, -3.31, -4.14, -5.15, -6.45, -4.36, -4.52, -5.01, -4.85, -5.58, -4.63, -4.71, -5.28, -6.34, -4.3, -4.45, -5.84, -6.59, -4.8, -5.35, -4.75, -6.29, -5.96, -3.91, -4.6, -4.41, -3.18, -4.87, -7, -4.67, -3.83, -2.94, -6.38, -6.15, -5.71, -6.05, -5.65, -5.19, -6.2, -2.96, -4.89, -5.08, -4.5, -4.96, -5.13, -3.52, -5.22, -6.28, -4.61, -5.35, -5.52, -6.07, -4.57, -5.17, -4.48, -5.23, -5.66, -3.75, -5.27, -4.05, -6.2, -5.47, -5.27, -5.39, -3.65, -5.02, -4.76, -5.94, -5.73, -4, -3.74, -3.75, -6.38, -2.95, -3.98, -5.03, -4.3, -5.97, -0.1, 1.05, -0.2, 1.19, 2.3, -0.03, 0.26, 1.05, -0.02, 0.62, 1.87, 1.97, 1.38, -0.85, 0.95, 2.06, 1.81, 0.81, -1.7, 1.06, 1.57, 1.05, 1.16, 1.43, 0.6, 2.31, 1.47, -0.24, 2.38, 2.2, 1.82, -0.66, 0.43, 1.64, 1.04, 1.35, 3.46, 0.18, -1.11, 1.27, 0.31, 1.45, 0.19, 3.21, 0.88, 0.52, 0.83, 1.86, 1.1, -0.63 )
# Second coordinate of each of the 300 points.
x[,2] <- c( 2, 2.76, 2.04, 2.74, 1.85, 1.94, 2.48, 2.99, 0.75, 1.97, 1.93, 1.24, 0.97, 1.37, 2.59, 1.58, 1.22, 2.16, 0.76, 3.05, 1.52, 2.19, 2.05, 2, 3.81, 1.17, 3.15, 2.03, 1.16, 1.93, 2.75, 1.57, 1.23, 2.15, 2.99, 1.93, 0.61, 0.69, 1.23, 1.47, 1.98, 2.67, 1.57, 0.89, 2.61, 2.28, 3.16, 0.32, 2.09, 3.35, 2.72, 1.17, 2.73, 1.13, 1.55, 3.19, 1.71, 2.83, 1.71, 0.42, 1.15, 0.91, 1.52, 1.66, 1.85, 1.76, 3.89, 0.61, 1.59, 2.35, 3.63, 2.09, 3.24, 0.36, 3.45, 1.31, 1.72, 0.89, 2.13, 3.79, 4.42, 0.92, 2.49, 3.39, 1.8, 1.78, 1.7, 2.6, 3.4, 2.69, 2.32, 1.7, 2.5, 1.45, 1.72, 3.1, 2.44, 2.24, 1.74, 2.93, 3.33, 1.13, 2.06, 2.05, 1.42, 1, 2, 2.66, 3.48, 0.09, 1.3, 1.69, 0.34, 1.25, 1.22, 1.28, -0.19, 2.21, 1.37, 3.52, 2.8, 0.55, 2.1, 1.41, 2.89, 2.05, 1.44, 2.44, 2.15, 1.84, 4.02, 1.47, 1.53, 0.45, 1.96, 2.89, -0.07, 1.75, 0.82, 3.44, 3.36, 2.33, 3.43, 1.13, 2.95, 1.41, 2.32, 1.7, 1.72, 2.55, 0.7, 1.75, 2.17, 1.6, 2.1, 1.68, 3.62, 2.71, 4.97, 1.2, 2.81, 4.1, 2.3, 0.92, 0.99, 1.96, 3.31, 2.75, -0.14, 1.3, 1.99, 0.54, 2.69, -0.46, 2.14, 1.61, 1.51, 1.72, 2.31, 2.4, 1.77, 0.08, 0.56, 0.53, 2.76, 1.76, 2.27, 0.44, 1.46, 2.56, 1.82, 1.88, 1.93, 3.21, 1.39, 2.68, 2.9, 0.81, 2.12, 1.99, 3.03, 2.91, 2, 2.14, 1.28, 1.8, 0.97, 1.03, 0.78, 2.84, 3.11, 1.59, 0.87, 1.91, 4.24, 4.04, 0.28, 1.64, 3.53, 1.96, 3.6, 1.67, 2.6, 2.22, 5.23, 2.92, 0.79, 1.4, 2.37, 0.1, 0.2, 0.88, 1.65, 3.24, 1.73, 2.16, 1.94, 1.29, 3.36, 0.9, 1.77, 1.65, 2.53, 3.61, 2.51, 3.38, 2.76, 1.38, 2.08, 3.38, -1.56, 0.32, -0.16, 0.88, 0.75, 0.3, 1.49, -1.53, 0.91, -1.58, 0.59, 0.09, 0.97, 0.08, -1.57, -2.01, 0.54, -0.07, -0.57, -0.31, -0.67, -0.16, -0.93, -1.98, -0.22, 1.05, 1.88, 0, -0.08, 0.96, 0.05, -0.43, -1.74, -1.26, 0.41, -1.46, 1.05, -1.35, -0.19, 0, -0.01, 0.15, 0.6, -0.13, -0.25, 0.16, -0.43, 1.54, -2.17, 1.03 )
# Confirm the shape: 300 rows, 2 columns, numeric.
str(x)
## num [1:300, 1:2] 3.37 1.44 2.36 2.63 2.4 1.89 3.51 1.91 4.02 1.94 ...
# Fit k-means with 3 clusters and 20 random restarts on the synthetic
# 2-D data. NOTE(review): no set.seed() precedes this, so the cluster
# labels/sizes recorded below come from one particular random run.
# Create the k-means model: km.out
km.out <- kmeans(x, centers=3, nstart=20)
# Inspect the result (summary shows each list component and its length)
summary(km.out)
## Length Class Mode
## cluster 300 -none- numeric
## centers 6 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# Print the cluster membership component of the model
km.out$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 2 2 2 2 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1
## [281] 3 3 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 1 3 3
# Print the km.out object (its print method shows sizes, means,
# memberships, per-cluster WSS, and the between/total SS ratio)
km.out
## K-means clustering with 3 clusters of sizes 98, 150, 52
##
## Cluster means:
## [,1] [,2]
## 1 2.2170408 2.05153061
## 2 -5.0554667 1.96973333
## 3 0.6642308 -0.09115385
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 2 2 2 2 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1
## [281] 3 3 3 3 3 3 1 3 3 3 3 3 3 1 3 3 3 1 3 3
##
## Within cluster sum of squares by cluster:
## [1] 148.7013 295.1237 95.4708
## (between_SS / total_SS = 87.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of x, colored by fitted cluster
plot(x, col=km.out$cluster, main="k-means with 3 clusters", xlab="", ylab="")
# Set up 2 x 3 plotting grid
par(mfrow = c(2, 3))
for(i in 1:6) {
# Run kmeans() on x with three clusters and one start
km.out <- kmeans(x, centers=3, nstart=1)
# Plot clusters
plot(x, col = km.out$cluster,
main = km.out$tot.withinss,
xlab = "", ylab = "")
}
par(mfrow = c(1, 1))
# Elbow method: record the total within-cluster sum of squares for
# k = 1..15 and plot it; the "elbow" in the curve suggests a good k.
# Keep the candidate count in one variable rather than repeating the
# magic constant 15 in the loop range and the plot's x values.
n_clusters <- 15
# Initialize total within sum of squares error: wss.
# Preallocated to full length instead of growing a scalar (`wss <- 0`)
# one element at a time inside the loop; the resulting values are the same.
wss <- numeric(n_clusters)
# For 1 to n_clusters cluster centers
for (i in seq_len(n_clusters)) {
  km.out <- kmeans(x, centers = i, nstart = 20)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}
# Plot total within sum of squares vs. number of clusters
plot(seq_len(n_clusters), wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
pokemon <- matrix(data=NA, nrow=800, ncol=6)
v1 <- c(45, 60, 80, 80, 39, 58, 78, 78, 78, 44, 59, 79, 79, 45, 50, 60, 40, 45, 65, 65, 40, 63, 83, 83, 30, 55, 40, 65, 35, 60, 35, 60, 50, 75, 55, 70, 90, 46, 61, 81, 70, 95, 38, 73, 115, 140, 40, 75, 45, 60, 75, 35, 60, 60, 70, 10, 35, 40, 65, 50, 80, 40, 65, 55, 90, 40, 65, 90, 25, 40, 55, 55, 70, 80, 90, 50, 65, 80, 40, 80, 40, 55, 80, 50, 65, 90, 95, 95, 25, 50, 52, 35, 60, 65, 90, 80, 105, 30, 50, 30, 45, 60, 60, 35, 60, 85, 30, 55, 40, 60, 60, 95, 50, 60, 50, 50, 90, 40, 65, 80, 105, 250, 65, 105, 105, 30, 55, 45, 80, 30, 60, 40, 70, 65, 65, 65, 65, 65, 75, 20, 95, 95, 130, 48, 55, 130, 65, 65, 65, 35, 70, 30, 60, 80, 80, 160, 90, 90, 90, 41, 61, 91, 106, 106, 106, 100, 45, 60, 80, 39, 58, 78, 50, 65, 85, 35, 85, 60, 100, 40, 55, 40, 70, 85, 75, 125, 20, 50, 90, 35, 55, 40, 65, 55, 70, 90, 90, 75, 70, 100, 70, 90, 35, 55, 75, 55, 30, 75, 65, 55, 95, 65, 95, 60, 95, 60, 48, 190, 70, 50, 75, 100, 65, 75, 75, 60, 90, 65, 70, 70, 20, 80, 80, 55, 60, 90, 40, 50, 50, 100, 55, 35, 75, 45, 65, 65, 45, 75, 75, 75, 90, 90, 85, 73, 55, 35, 50, 45, 45, 45, 95, 255, 90, 115, 100, 50, 70, 100, 100, 106, 106, 100, 40, 50, 70, 70, 45, 60, 80, 80, 50, 70, 100, 100, 35, 70, 38, 78, 45, 50, 60, 50, 60, 40, 60, 80, 40, 70, 90, 40, 60, 40, 60, 28, 38, 68, 68, 40, 70, 60, 60, 60, 80, 150, 31, 61, 1, 64, 84, 104, 72, 144, 50, 30, 50, 70, 50, 50, 50, 50, 50, 60, 70, 70, 30, 60, 60, 40, 70, 70, 60, 60, 65, 65, 50, 70, 100, 45, 70, 70, 130, 170, 60, 70, 70, 70, 60, 80, 60, 45, 50, 80, 50, 70, 45, 75, 75, 73, 73, 70, 70, 50, 110, 43, 63, 40, 60, 66, 86, 45, 75, 20, 95, 70, 60, 44, 64, 64, 20, 40, 99, 65, 65, 65, 95, 50, 80, 80, 70, 90, 110, 35, 55, 55, 100, 43, 45, 65, 95, 95, 40, 60, 80, 80, 80, 80, 80, 80, 80, 80, 80, 100, 100, 100, 100, 105, 105, 100, 50, 50, 50, 50, 55, 75, 95, 44, 64, 76, 53, 64, 84, 40, 55, 85, 59, 79, 37, 77, 45, 60, 80, 40, 60, 67, 97, 30, 60, 40, 60, 60, 60, 70, 30, 70, 60, 55, 85, 45, 70, 76, 111, 75, 90, 150, 55, 65, 65, 60, 100, 49, 71, 45, 63, 103, 57, 
67, 50, 20, 100, 76, 50, 58, 68, 108, 108, 135, 40, 70, 70, 68, 108, 40, 70, 48, 83, 74, 49, 69, 45, 60, 90, 90, 70, 70, 110, 115, 100, 75, 75, 85, 86, 65, 65, 75, 110, 85, 68, 68, 60, 45, 70, 50, 50, 50, 50, 50, 50, 75, 80, 75, 100, 90, 91, 110, 150, 150, 120, 80, 100, 70, 100, 100, 120, 100, 45, 60, 75, 65, 90, 110, 55, 75, 95, 45, 60, 45, 65, 85, 41, 64, 50, 75, 50, 75, 50, 75, 76, 116, 50, 62, 80, 45, 75, 55, 70, 85, 55, 67, 60, 110, 103, 103, 75, 85, 105, 50, 75, 105, 120, 75, 45, 55, 75, 30, 40, 60, 40, 60, 45, 70, 70, 50, 60, 95, 70, 105, 105, 75, 50, 70, 50, 65, 72, 38, 58, 54, 74, 55, 75, 50, 80, 40, 60, 55, 75, 45, 60, 70, 45, 65, 110, 62, 75, 36, 51, 71, 60, 80, 55, 50, 70, 69, 114, 55, 100, 165, 50, 70, 44, 74, 40, 60, 60)
pokemon[, 1] <- c( v1, 35, 65, 85, 55, 75, 50, 60, 60, 46, 66, 76, 55, 95, 70, 50, 80, 109, 45, 65, 77, 59, 89, 45, 65, 95, 70, 100, 70, 110, 85, 58, 52, 72, 92, 55, 85, 91, 91, 91, 79, 79, 79, 79, 100, 100, 89, 89, 125, 125, 125, 91, 91, 100, 100, 71, 56, 61, 88, 40, 59, 75, 41, 54, 72, 38, 85, 45, 62, 78, 38, 45, 80, 62, 86, 44, 54, 78, 66, 123, 67, 95, 75, 62, 74, 74, 45, 59, 60, 60, 78, 101, 62, 82, 53, 86, 42, 72, 50, 65, 50, 71, 44, 62, 58, 82, 77, 123, 95, 78, 67, 50, 45, 68, 90, 57, 43, 85, 49, 44, 54, 59, 65, 55, 75, 85, 55, 95, 40, 85, 126, 126, 108, 50, 50, 80, 80, 80 )
v2 <- c(49, 62, 82, 100, 52, 64, 84, 130, 104, 48, 63, 83, 103, 30, 20, 45, 35, 25, 90, 150, 45, 60, 80, 80, 56, 81, 60, 90, 60, 85, 55, 90, 75, 100, 47, 62, 92, 57, 72, 102, 45, 70, 41, 76, 45, 70, 45, 80, 50, 65, 80, 70, 95, 55, 65, 55, 80, 45, 70, 52, 82, 80, 105, 70, 110, 50, 65, 95, 20, 35, 50, 50, 80, 100, 130, 75, 90, 105, 40, 70, 80, 95, 120, 85, 100, 65, 75, 75, 35, 60, 65, 85, 110, 45, 70, 80, 105, 65, 95, 35, 50, 65, 65, 45, 48, 73, 105, 130, 30, 50, 40, 95, 50, 80, 120, 105, 55, 65, 90, 85, 130, 5, 55, 95, 125, 40, 65, 67, 92, 45, 75, 45, 110, 50, 83, 95, 125, 155, 100, 10, 125, 155, 85, 48, 55, 65, 65, 130, 60, 40, 60, 80, 115, 105, 135, 110, 85, 90, 100, 64, 84, 134, 110, 190, 150, 100, 49, 62, 82, 52, 64, 84, 65, 80, 105, 46, 76, 30, 50, 20, 35, 60, 90, 90, 38, 58, 40, 25, 30, 20, 40, 50, 75, 40, 55, 75, 95, 80, 20, 50, 100, 75, 35, 45, 55, 70, 30, 75, 65, 45, 85, 65, 65, 85, 75, 60, 72, 33, 80, 65, 90, 70, 75, 85, 125, 80, 120, 95, 130, 150, 10, 125, 185, 95, 80, 130, 40, 50, 50, 100, 55, 65, 105, 55, 40, 80, 60, 90, 90, 95, 60, 120, 80, 95, 20, 35, 95, 30, 63, 75, 80, 10, 85, 115, 75, 64, 84, 134, 164, 90, 130, 100, 45, 65, 85, 110, 60, 85, 120, 160, 70, 85, 110, 150, 55, 90, 30, 70, 45, 35, 70, 35, 50, 30, 50, 70, 40, 70, 100, 55, 85, 30, 50, 25, 35, 65, 85, 30, 60, 40, 130, 60, 80, 160, 45, 90, 90, 51, 71, 91, 60, 120, 20, 45, 45, 65, 75, 85, 85, 105, 70, 90, 110, 140, 40, 60, 100, 45, 75, 75, 50, 40, 73, 47, 60, 43, 73, 90, 120, 140, 70, 90, 60, 100, 120, 85, 25, 45, 60, 100, 70, 100, 85, 115, 40, 70, 110, 115, 100, 55, 95, 48, 78, 80, 120, 40, 70, 41, 81, 95, 125, 15, 60, 70, 90, 75, 115, 165, 40, 70, 68, 50, 130, 150, 23, 50, 80, 120, 40, 60, 80, 64, 104, 84, 90, 30, 75, 95, 135, 145, 55, 75, 135, 145, 100, 50, 75, 80, 100, 90, 130, 100, 150, 150, 180, 150, 180, 100, 150, 180, 70, 95, 68, 89, 109, 58, 78, 104, 51, 66, 86, 55, 75, 120, 45, 85, 25, 85, 65, 85, 120, 30, 70, 125, 165, 42, 52, 29, 59, 79, 69, 94, 30, 80, 45, 65, 105, 35, 60, 48, 
83, 100, 50, 80, 66, 76, 136, 60, 125, 55, 82, 30, 63, 93, 24, 89, 80, 25, 5, 65, 92, 70, 90, 130, 170, 85, 70, 110, 145, 72, 112, 50, 90, 61, 106, 100, 49, 69, 20, 62, 92, 132, 120, 70, 85, 140, 100, 123, 95, 50, 76, 110, 60, 95, 130, 80, 125, 165, 55, 100, 80, 50, 65, 65, 65, 65, 65, 75, 105, 125, 120, 120, 90, 160, 100, 120, 70, 80, 100, 90, 100, 103, 120, 100, 45, 60, 75, 63, 93, 123, 55, 75, 100, 55, 85, 60, 80, 110, 50, 88, 53, 98, 53, 98, 53, 98, 25, 55, 55, 77, 115, 60, 100, 75, 105, 135, 45, 57, 85, 135, 60, 60, 80, 105, 140, 50, 65, 95, 100, 125, 53, 63, 103, 45, 55, 100, 27, 67, 35, 60, 92, 72, 82, 117, 90, 140, 30, 86, 65, 95, 75, 90, 58, 30, 50, 78, 108, 112, 140, 50, 95, 65, 105, 50, 95, 30, 45, 55, 30, 40, 65, 44, 87, 50, 65, 95, 60, 100, 75, 75, 135, 55, 85, 40, 60, 75, 47, 77, 50, 94, 55, 80, 100)
pokemon[,2] <- c( v2, 55, 85, 115, 55, 75, 30, 40, 55, 87, 117, 147, 70, 110, 50, 40, 70, 66, 85, 125, 120, 74, 124, 85, 125, 110, 83, 123, 55, 65, 97, 109, 65, 85, 105, 85, 60, 90, 129, 90, 115, 100, 115, 105, 120, 150, 125, 145, 130, 170, 120, 72, 72, 77, 128, 120, 61, 78, 107, 45, 59, 69, 56, 63, 95, 36, 56, 50, 73, 81, 35, 22, 52, 50, 68, 38, 45, 65, 65, 100, 82, 124, 80, 48, 48, 48, 80, 110, 150, 50, 52, 72, 48, 80, 54, 92, 52, 105, 60, 75, 53, 73, 38, 55, 89, 121, 59, 77, 65, 92, 58, 50, 50, 75, 100, 80, 70, 110, 66, 66, 66, 66, 90, 85, 95, 100, 69, 117, 30, 70, 131, 131, 100, 100, 160, 110, 160, 110 )
v3 <- c(49, 63, 83, 123, 43, 58, 78, 111, 78, 65, 80, 100, 120, 35, 55, 50, 30, 50, 40, 40, 40, 55, 75, 80, 35, 60, 30, 65, 44, 69, 40, 55, 85, 110, 52, 67, 87, 40, 57, 77, 48, 73, 40, 75, 20, 45, 35, 70, 55, 70, 85, 55, 80, 50, 60, 25, 50, 35, 60, 48, 78, 35, 60, 45, 80, 40, 65, 95, 15, 30, 45, 65, 50, 70, 80, 35, 50, 65, 35, 65, 100, 115, 130, 55, 70, 65, 110, 180, 70, 95, 55, 45, 70, 55, 80, 50, 75, 100, 180, 30, 45, 60, 80, 160, 45, 70, 90, 115, 50, 70, 80, 85, 95, 110, 53, 79, 75, 95, 120, 95, 120, 5, 115, 80, 100, 70, 95, 60, 65, 55, 85, 65, 80, 35, 57, 57, 100, 120, 95, 55, 79, 109, 80, 48, 50, 60, 60, 60, 70, 100, 125, 90, 105, 65, 85, 65, 100, 85, 90, 45, 65, 95, 90, 100, 70, 100, 65, 80, 100, 43, 58, 78, 64, 80, 100, 34, 64, 30, 50, 30, 50, 40, 70, 80, 38, 58, 15, 28, 15, 65, 85, 45, 70, 40, 55, 85, 105, 95, 50, 80, 115, 75, 40, 50, 70, 55, 30, 55, 45, 45, 85, 60, 110, 42, 80, 60, 48, 58, 65, 90, 140, 70, 105, 200, 230, 50, 75, 75, 100, 140, 230, 75, 115, 55, 50, 75, 40, 120, 40, 80, 85, 35, 75, 45, 70, 140, 30, 50, 90, 95, 60, 120, 90, 62, 35, 35, 95, 15, 37, 37, 105, 10, 75, 85, 115, 50, 70, 110, 150, 130, 90, 100, 35, 45, 65, 75, 40, 60, 70, 80, 50, 70, 90, 110, 35, 70, 41, 61, 35, 55, 50, 55, 70, 30, 50, 70, 50, 40, 60, 30, 60, 30, 100, 25, 35, 65, 65, 32, 62, 60, 80, 60, 80, 100, 90, 45, 45, 23, 43, 63, 30, 60, 40, 135, 45, 65, 75, 125, 85, 125, 100, 140, 180, 230, 55, 75, 85, 40, 60, 80, 40, 50, 55, 55, 45, 53, 83, 20, 40, 70, 35, 45, 40, 70, 100, 140, 35, 65, 60, 45, 50, 80, 40, 60, 60, 90, 110, 60, 60, 65, 85, 43, 73, 65, 85, 55, 105, 77, 97, 50, 100, 20, 79, 70, 70, 35, 65, 75, 90, 130, 83, 70, 60, 60, 48, 50, 80, 80, 50, 70, 90, 85, 105, 105, 130, 55, 60, 100, 80, 130, 80, 100, 130, 150, 200, 100, 150, 90, 120, 80, 100, 90, 90, 140, 160, 90, 100, 100, 50, 20, 160, 90, 64, 85, 105, 44, 52, 71, 53, 68, 88, 30, 50, 70, 40, 60, 41, 51, 34, 49, 79, 35, 65, 40, 60, 118, 168, 45, 85, 105, 95, 50, 42, 102, 70, 35, 55, 45, 70, 48, 68, 66, 34, 44, 44, 84, 
94, 60, 52, 42, 64, 50, 47, 67, 86, 116, 95, 45, 5, 45, 108, 45, 65, 95, 115, 40, 40, 70, 88, 78, 118, 90, 110, 40, 65, 72, 56, 76, 50, 50, 75, 105, 65, 115, 95, 130, 125, 67, 67, 95, 86, 130, 110, 125, 80, 70, 65, 95, 145, 135, 70, 77, 107, 107, 107, 107, 107, 130, 105, 70, 120, 100, 106, 110, 120, 100, 120, 80, 100, 90, 100, 75, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 85, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 45, 85, 50, 62, 80, 32, 63, 85, 105, 130, 43, 55, 40, 60, 86, 126, 55, 85, 95, 40, 55, 75, 85, 75, 70, 90, 80, 59, 99, 89, 60, 85, 50, 75, 65, 35, 45, 80, 45, 55, 105, 67, 85, 125, 70, 115, 80, 85, 145, 103, 133, 45, 65, 62, 82, 40, 60, 40, 60, 50, 70, 95, 40, 50, 75, 50, 63, 50, 65, 85, 50, 70, 60, 45, 105, 45, 70, 50, 70, 80, 50, 60, 91, 131, 70, 95, 115, 40)
pokemon[,3] <- c( v3, 70, 80, 55, 75, 55, 60, 90, 60, 70, 90, 40, 80, 30, 85, 40, 84, 50, 60, 90, 50, 80, 70, 100, 95, 50, 75, 75, 105, 66, 112, 50, 70, 90, 55, 65, 129, 90, 72, 70, 80, 70, 70, 100, 120, 90, 90, 90, 100, 90, 90, 90, 77, 90, 95, 65, 95, 122, 40, 58, 72, 40, 52, 67, 38, 77, 43, 55, 71, 40, 60, 50, 58, 72, 39, 47, 68, 48, 62, 62, 78, 60, 54, 76, 76, 100, 150, 50, 150, 60, 72, 66, 86, 53, 88, 67, 115, 60, 90, 62, 88, 33, 52, 77, 119, 50, 72, 65, 75, 57, 150, 35, 53, 70, 91, 48, 76, 70, 70, 70, 70, 122, 122, 122, 122, 85, 184, 35, 80, 95, 95, 121, 150, 110, 60, 60, 120 )
v4 <- c(65, 80, 100, 122, 60, 80, 109, 130, 159, 50, 65, 85, 135, 20, 25, 90, 20, 25, 45, 15, 35, 50, 70, 135, 25, 50, 31, 61, 40, 65, 50, 90, 20, 45, 40, 55, 75, 40, 55, 85, 60, 95, 50, 81, 45, 85, 30, 65, 75, 85, 110, 45, 60, 40, 90, 35, 50, 40, 65, 65, 95, 35, 60, 70, 100, 40, 50, 70, 105, 120, 135, 175, 35, 50, 65, 70, 85, 100, 50, 80, 30, 45, 55, 65, 80, 40, 100, 130, 95, 120, 58, 35, 60, 45, 70, 40, 65, 45, 85, 100, 115, 130, 170, 30, 43, 73, 25, 50, 55, 80, 60, 125, 40, 50, 35, 35, 60, 60, 85, 30, 45, 35, 100, 40, 60, 70, 95, 35, 65, 70, 100, 100, 55, 115, 95, 100, 55, 65, 40, 15, 60, 70, 85, 48, 45, 110, 110, 95, 85, 90, 115, 55, 65, 60, 70, 65, 95, 125, 125, 50, 70, 100, 154, 154, 194, 100, 49, 63, 83, 60, 80, 109, 44, 59, 79, 35, 45, 36, 76, 40, 55, 40, 60, 70, 56, 76, 35, 45, 40, 40, 80, 70, 95, 65, 80, 115, 165, 90, 20, 60, 30, 90, 35, 45, 55, 40, 30, 105, 75, 25, 65, 130, 60, 85, 100, 85, 72, 33, 90, 35, 60, 65, 35, 55, 55, 40, 60, 55, 55, 65, 10, 40, 40, 35, 50, 75, 70, 80, 30, 60, 65, 65, 105, 65, 80, 40, 80, 110, 140, 95, 40, 60, 105, 85, 20, 35, 35, 85, 65, 70, 40, 75, 115, 90, 90, 45, 65, 95, 95, 90, 110, 100, 65, 85, 105, 145, 70, 85, 110, 130, 50, 60, 85, 95, 30, 60, 30, 50, 20, 25, 100, 25, 50, 40, 60, 90, 30, 60, 90, 30, 50, 55, 85, 45, 65, 125, 165, 50, 80, 40, 60, 35, 55, 95, 30, 50, 30, 51, 71, 91, 20, 40, 20, 45, 35, 55, 65, 85, 55, 55, 40, 50, 60, 60, 40, 60, 80, 65, 105, 135, 85, 75, 47, 73, 100, 43, 73, 65, 95, 110, 70, 90, 65, 105, 145, 85, 70, 90, 60, 45, 50, 80, 85, 115, 40, 70, 110, 60, 100, 95, 55, 46, 76, 50, 90, 40, 70, 61, 81, 40, 70, 10, 100, 70, 60, 63, 83, 93, 30, 60, 72, 95, 75, 115, 23, 50, 80, 120, 55, 75, 95, 74, 94, 114, 45, 40, 40, 60, 110, 120, 35, 55, 95, 105, 50, 100, 75, 110, 140, 130, 160, 150, 180, 100, 150, 150, 180, 100, 150, 180, 70, 95, 45, 55, 75, 58, 78, 104, 61, 81, 111, 30, 40, 50, 35, 55, 25, 55, 40, 60, 95, 50, 125, 30, 65, 42, 47, 29, 79, 59, 69, 94, 30, 80, 45, 60, 85, 62, 87, 57, 92, 60, 60, 90, 44, 
54, 54, 105, 105, 42, 64, 65, 41, 71, 24, 79, 10, 70, 15, 92, 92, 40, 50, 80, 120, 40, 35, 115, 140, 38, 68, 30, 60, 61, 86, 90, 49, 69, 60, 62, 92, 132, 45, 130, 80, 55, 110, 95, 125, 120, 116, 60, 130, 45, 70, 135, 65, 65, 75, 65, 80, 95, 105, 105, 105, 105, 105, 75, 105, 125, 150, 150, 130, 80, 100, 120, 75, 80, 100, 135, 100, 120, 120, 100, 45, 60, 75, 45, 70, 100, 63, 83, 108, 35, 60, 25, 35, 45, 50, 88, 53, 98, 53, 98, 53, 98, 67, 107, 36, 50, 65, 50, 80, 25, 50, 60, 55, 77, 30, 50, 60, 80, 25, 40, 55, 50, 65, 85, 30, 30, 40, 50, 70, 30, 40, 55, 37, 77, 70, 110, 80, 35, 45, 65, 15, 30, 140, 106, 35, 65, 35, 45, 103, 55, 95, 53, 83, 74, 112, 40, 60, 80, 120, 40, 65, 55, 75, 95, 105, 125, 125, 44, 87, 65, 80, 110, 40, 60, 75, 40, 60, 55, 85, 65, 85, 40, 57, 97, 24, 54, 45, 70, 70)
pokemon[,4] <- c( v4, 45, 75, 105, 85, 125, 65, 95, 145, 30, 40, 60, 60, 70, 95, 40, 100, 81, 55, 95, 60, 35, 55, 40, 60, 40, 37, 57, 45, 55, 105, 48, 45, 65, 125, 50, 135, 90, 72, 90, 125, 110, 125, 145, 150, 120, 115, 105, 130, 120, 170, 129, 129, 128, 77, 120, 48, 56, 74, 62, 90, 114, 62, 83, 103, 32, 50, 40, 56, 74, 27, 27, 90, 73, 109, 61, 75, 112, 62, 97, 46, 69, 65, 63, 83, 83, 35, 45, 150, 50, 63, 99, 59, 85, 37, 68, 39, 54, 60, 97, 58, 120, 61, 109, 45, 69, 67, 99, 110, 74, 81, 50, 55, 83, 110, 80, 50, 65, 44, 44, 44, 44, 58, 58, 58, 58, 32, 44, 45, 97, 131, 131, 81, 100, 160, 150, 170, 130 )
v5 <- c(65, 80, 100, 120, 50, 65, 85, 85, 115, 64, 80, 105, 115, 20, 25, 80, 20, 25, 80, 80, 35, 50, 70, 80, 35, 70, 31, 61, 54, 79, 50, 80, 30, 55, 40, 55, 85, 40, 55, 75, 65, 90, 65, 100, 25, 50, 40, 75, 65, 75, 90, 55, 80, 55, 75, 45, 70, 40, 65, 50, 80, 45, 70, 50, 80, 40, 50, 90, 55, 70, 95, 95, 35, 60, 85, 30, 45, 70, 100, 120, 30, 45, 65, 65, 80, 40, 80, 80, 55, 70, 62, 35, 60, 70, 95, 50, 100, 25, 45, 35, 55, 75, 95, 45, 90, 115, 25, 50, 55, 80, 45, 65, 50, 80, 110, 110, 75, 45, 70, 30, 45, 105, 40, 80, 100, 25, 45, 50, 80, 55, 85, 120, 80, 95, 85, 85, 70, 90, 70, 20, 100, 130, 95, 48, 65, 95, 95, 110, 75, 55, 70, 45, 70, 75, 95, 110, 125, 90, 85, 50, 70, 100, 90, 100, 120, 100, 65, 80, 100, 50, 65, 85, 48, 63, 83, 45, 55, 56, 96, 80, 110, 40, 60, 80, 56, 76, 35, 55, 20, 65, 105, 45, 70, 45, 60, 90, 110, 100, 50, 80, 65, 100, 55, 65, 95, 55, 30, 85, 45, 25, 65, 95, 130, 42, 110, 85, 48, 58, 65, 35, 60, 65, 65, 65, 95, 40, 60, 55, 80, 100, 230, 95, 105, 75, 50, 75, 40, 80, 30, 60, 85, 35, 75, 45, 140, 70, 50, 80, 90, 95, 40, 60, 95, 65, 45, 35, 110, 65, 55, 55, 70, 135, 100, 75, 115, 50, 70, 100, 120, 154, 154, 100, 55, 65, 85, 85, 50, 60, 70, 80, 50, 70, 90, 110, 30, 60, 41, 61, 30, 25, 50, 25, 90, 50, 70, 100, 30, 40, 60, 30, 50, 30, 70, 35, 55, 115, 135, 52, 82, 60, 60, 35, 55, 65, 30, 50, 30, 23, 43, 73, 30, 60, 40, 90, 35, 55, 65, 115, 55, 95, 40, 50, 60, 80, 55, 75, 85, 40, 60, 80, 75, 85, 75, 75, 80, 53, 83, 20, 40, 65, 35, 45, 45, 75, 105, 70, 80, 110, 60, 45, 50, 80, 40, 60, 75, 105, 105, 60, 60, 85, 65, 41, 71, 35, 55, 70, 120, 87, 107, 50, 80, 55, 125, 70, 120, 33, 63, 83, 90, 130, 87, 80, 60, 60, 48, 50, 80, 80, 50, 70, 90, 55, 75, 75, 65, 65, 30, 50, 80, 90, 60, 80, 90, 110, 100, 200, 150, 130, 150, 110, 120, 140, 160, 90, 90, 90, 100, 100, 50, 20, 160, 90, 55, 65, 85, 44, 52, 71, 56, 76, 101, 30, 40, 60, 40, 60, 41, 51, 34, 49, 79, 70, 105, 30, 50, 88, 138, 45, 105, 85, 95, 50, 42, 102, 90, 30, 50, 53, 78, 62, 82, 66, 44, 54, 56, 96, 96, 105, 
52, 37, 59, 50, 41, 61, 86, 116, 45, 90, 65, 42, 108, 45, 55, 85, 95, 85, 40, 70, 70, 42, 72, 55, 75, 40, 65, 72, 61, 86, 120, 60, 85, 105, 85, 90, 95, 55, 50, 85, 95, 115, 56, 65, 95, 75, 60, 75, 115, 115, 150, 135, 70, 77, 107, 107, 107, 107, 107, 130, 105, 70, 100, 120, 106, 110, 120, 100, 130, 80, 100, 90, 100, 75, 120, 100, 55, 75, 95, 45, 55, 65, 45, 60, 70, 39, 69, 45, 65, 90, 37, 50, 48, 63, 48, 63, 48, 63, 55, 95, 30, 42, 55, 32, 63, 25, 40, 80, 43, 55, 45, 65, 86, 126, 35, 50, 65, 40, 55, 75, 85, 75, 60, 80, 80, 39, 79, 69, 50, 75, 50, 75, 55, 35, 45, 70, 45, 55, 105, 67, 35, 75, 70, 115, 80, 65, 105, 45, 65, 45, 65, 62, 82, 40, 60, 40, 60, 65, 85, 110, 50, 60, 85, 50, 63, 60, 75, 95, 50, 70, 60, 45, 105, 55, 80, 85, 105, 45, 50, 60, 86, 116, 60, 85, 85, 40)
pokemon[,5] <- c( v5, 70, 80, 55, 95, 55, 60, 90, 40, 50, 70, 40, 80, 135, 65, 60, 99, 50, 60, 90, 50, 80, 40, 70, 95, 50, 75, 65, 95, 66, 48, 50, 70, 90, 55, 105, 72, 90, 129, 80, 90, 80, 80, 120, 100, 80, 80, 90, 90, 100, 90, 90, 128, 77, 95, 45, 58, 75, 60, 70, 100, 44, 56, 71, 36, 77, 38, 52, 69, 25, 30, 50, 54, 66, 79, 98, 154, 57, 81, 48, 71, 90, 60, 81, 81, 37, 49, 50, 150, 65, 89, 57, 75, 46, 75, 56, 86, 60, 123, 63, 89, 43, 94, 45, 59, 63, 92, 130, 63, 67, 150, 75, 113, 150, 87, 60, 82, 55, 55, 55, 55, 75, 75, 75, 75, 35, 46, 40, 80, 98, 98, 95, 150, 110, 130, 130, 90 )
v6 <- c(45, 60, 80, 80, 65, 80, 100, 100, 100, 43, 58, 78, 78, 45, 30, 70, 50, 35, 75, 145, 56, 71, 101, 121, 72, 97, 70, 100, 55, 80, 90, 110, 40, 65, 41, 56, 76, 50, 65, 85, 35, 60, 65, 100, 20, 45, 55, 90, 30, 40, 50, 25, 30, 45, 90, 95, 120, 90, 115, 55, 85, 70, 95, 60, 95, 90, 90, 70, 90, 105, 120, 150, 35, 45, 55, 40, 55, 70, 70, 100, 20, 35, 45, 90, 105, 15, 30, 30, 45, 70, 60, 75, 100, 45, 70, 25, 50, 40, 70, 80, 95, 110, 130, 70, 42, 67, 50, 75, 100, 140, 40, 55, 35, 45, 87, 76, 30, 35, 60, 25, 40, 50, 60, 90, 100, 60, 85, 63, 68, 85, 115, 90, 105, 95, 105, 93, 85, 105, 110, 80, 81, 81, 60, 48, 55, 65, 130, 65, 40, 35, 55, 55, 80, 130, 150, 30, 85, 100, 90, 50, 70, 80, 130, 130, 140, 100, 45, 60, 80, 65, 80, 100, 43, 58, 78, 20, 90, 50, 70, 55, 85, 30, 40, 130, 67, 67, 60, 15, 15, 20, 40, 70, 95, 35, 45, 55, 45, 50, 40, 50, 30, 70, 50, 80, 110, 85, 30, 30, 95, 15, 35, 110, 65, 91, 30, 85, 48, 33, 85, 15, 40, 45, 85, 30, 30, 30, 45, 85, 65, 75, 5, 85, 75, 115, 40, 55, 20, 30, 50, 50, 35, 65, 45, 75, 70, 70, 65, 95, 115, 85, 40, 50, 60, 85, 75, 35, 70, 65, 95, 83, 100, 55, 115, 100, 85, 41, 51, 61, 71, 110, 90, 100, 70, 95, 120, 145, 45, 55, 80, 100, 40, 50, 60, 70, 35, 70, 60, 100, 20, 15, 65, 15, 65, 30, 50, 70, 30, 60, 80, 85, 125, 85, 65, 40, 50, 80, 100, 65, 60, 35, 70, 30, 90, 100, 40, 160, 40, 28, 48, 68, 25, 50, 20, 30, 50, 70, 50, 20, 50, 50, 30, 40, 50, 50, 60, 80, 100, 65, 105, 135, 95, 95, 85, 85, 65, 40, 55, 65, 95, 105, 60, 60, 35, 40, 20, 20, 60, 80, 60, 10, 70, 100, 35, 55, 50, 80, 80, 90, 65, 70, 70, 60, 60, 35, 55, 55, 75, 23, 43, 75, 45, 80, 81, 70, 40, 45, 65, 75, 25, 25, 51, 65, 75, 115, 23, 50, 80, 100, 25, 45, 65, 32, 52, 52, 55, 97, 50, 50, 100, 120, 30, 50, 70, 110, 50, 50, 50, 110, 110, 110, 110, 90, 90, 90, 90, 95, 115, 100, 150, 150, 90, 180, 31, 36, 56, 61, 81, 108, 40, 50, 60, 60, 80, 100, 31, 71, 25, 65, 45, 60, 70, 55, 90, 58, 58, 30, 30, 36, 36, 36, 36, 66, 70, 40, 95, 85, 115, 35, 85, 34, 39, 115, 70, 80, 85, 105, 135, 105, 
71, 85, 112, 45, 74, 84, 23, 33, 10, 60, 30, 91, 35, 42, 82, 102, 92, 5, 60, 90, 112, 32, 47, 65, 95, 50, 85, 46, 66, 91, 50, 40, 60, 30, 125, 60, 50, 40, 50, 95, 83, 80, 95, 95, 65, 95, 80, 90, 80, 110, 40, 45, 110, 91, 86, 86, 86, 86, 86, 95, 80, 115, 90, 100, 77, 100, 90, 90, 85, 80, 100, 125, 100, 127, 120, 100, 63, 83, 113, 45, 55, 65, 45, 60, 70, 42, 77, 55, 60, 80, 66, 106, 64, 101, 64, 101, 64, 101, 24, 29, 43, 65, 93, 76, 116, 15, 20, 25, 72, 114, 68, 88, 50, 50, 35, 40, 45, 64, 69, 74, 45, 85, 42, 42, 92, 57, 47, 112, 66, 116, 30, 90, 98, 65, 74, 92, 50, 95, 55, 60, 55, 45, 48, 58, 97, 30, 30, 22, 32, 70, 110, 65, 75, 65, 105, 75, 115, 45, 55, 65, 20, 30, 30, 55, 98, 44, 59, 79, 75, 95, 103, 60, 20, 15, 30, 40, 60, 65, 65, 108, 10, 20, 30, 50, 90, 60)
pokemon[,6] <- c( v6, 40, 50, 30, 40, 20, 55, 80, 57, 67, 97, 40, 50, 105, 25, 145, 32, 65, 105, 48, 35, 55, 60, 70, 55, 60, 80, 60, 80, 65, 109, 38, 58, 98, 60, 100, 108, 108, 108, 111, 121, 111, 101, 90, 90, 101, 91, 95, 95, 95, 108, 108, 90, 128, 99, 38, 57, 64, 60, 73, 104, 71, 97, 122, 57, 78, 62, 84, 126, 35, 29, 89, 72, 106, 42, 52, 75, 52, 68, 43, 58, 102, 68, 104, 104, 28, 35, 60, 60, 23, 29, 49, 72, 45, 73, 50, 68, 30, 44, 44, 59, 70, 109, 48, 71, 46, 58, 60, 118, 101, 50, 40, 60, 80, 75, 38, 56, 51, 56, 46, 41, 84, 99, 69, 54, 28, 28, 55, 123, 99, 99, 95, 50, 110, 70, 80, 70 )
# Assign descriptive column names to the pokemon stats matrix
colnames(pokemon) <- c("HitPoints", "Attack", "Defense", "SpecialAttack", "SpecialDefense", "Speed")
str(pokemon)
## num [1:800, 1:6] 45 60 80 80 39 58 78 78 78 44 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "HitPoints" "Attack" "Defense" "SpecialAttack" ...
# Column means: colMeans() is the idiomatic (and faster) equivalent of
# apply(pokemon, 2, FUN=mean); output is identical
colMeans(pokemon)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 69.25875 79.00125 73.84250 72.82000 71.90250
## Speed
## 68.27750
# Total within-cluster sum of squares for each candidate k: wss
# Preallocate to the number of candidate cluster counts instead of growing
# the vector one element at a time (the original `wss <- 0` + `wss[i] <- ...`
# reallocates on every iteration)
wss <- numeric(15)
# Look over 1 to 15 possible clusters
for (i in seq_len(15)) {
  # Fit the model: km.out
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
  # Save the within cluster sum of squares
  wss[i] <- km.out$tot.withinss
}
# Scree plot: total within-cluster sum of squares for each candidate k
plot(x = 1:15, y = wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Number of clusters chosen from the "elbow" of the scree plot
k <- 3

# Refit k-means with the selected number of clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)

# Print the fitted model
km.out
## K-means clustering with 3 clusters of sizes 355, 175, 270
##
## Cluster means:
## HitPoints Attack Defense SpecialAttack SpecialDefense Speed
## 1 54.68732 56.93239 53.64507 52.02254 53.04789 53.58873
## 2 79.30857 97.29714 108.93143 66.71429 87.04571 57.29143
## 3 81.90370 96.15926 77.65556 104.12222 86.87778 94.71111
##
## Clustering vector:
## [1] 1 1 3 3 1 1 3 3 3 1 1 2 3 1 1 1 1 1 1 3 1 1 3 3 1 1 1 3 1 3 1 3 1 2 1
## [36] 1 2 1 1 3 1 3 1 3 1 1 1 3 1 1 3 1 2 1 3 1 1 1 3 1 3 1 3 1 3 1 1 2 1 3
## [71] 3 3 1 2 2 1 1 3 1 3 1 2 2 1 3 1 2 2 1 3 1 1 3 1 2 1 2 1 2 1 3 3 3 2 1
## [106] 2 1 2 1 3 1 3 1 2 2 2 1 1 2 1 2 1 2 2 2 1 3 1 2 1 3 3 3 3 3 3 2 2 2 1
## [141] 2 2 2 1 1 3 3 3 1 1 2 1 2 3 3 2 3 3 3 1 1 3 3 3 3 3 1 1 2 1 1 3 1 1 2
## [176] 1 1 1 3 1 1 1 1 3 1 3 1 1 1 1 1 1 3 1 1 3 3 2 1 1 2 3 1 1 3 1 1 1 1 1
## [211] 2 3 2 1 2 3 1 1 3 1 2 1 2 2 2 1 2 1 2 2 2 2 2 1 1 2 1 2 1 2 1 1 3 1 3
## [246] 2 1 3 3 3 1 2 3 3 1 1 2 1 1 1 2 3 3 3 2 1 1 2 2 3 3 3 1 1 3 3 1 1 3 3
## [281] 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 2 1 1 3 3 1 1 1 2 1 1 3 1
## [316] 3 1 1 1 3 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 1 3 1 3 3 1 1 1 1 1 1 2 1 3 3
## [351] 1 3 1 3 2 2 1 3 1 1 1 3 1 3 1 2 3 3 3 3 2 1 2 1 2 1 2 1 2 1 2 1 3 1 2
## [386] 1 3 3 1 2 2 1 3 3 1 1 3 3 1 1 3 1 2 2 2 1 1 2 3 3 1 2 2 3 2 2 2 3 3 3
## [421] 3 3 3 2 3 3 3 3 3 3 2 3 1 2 2 1 1 3 1 1 3 1 1 3 1 1 1 1 1 1 3 1 3 1 2
## [456] 1 2 1 2 2 2 3 1 2 1 1 3 1 3 1 2 3 1 3 1 3 3 3 3 1 3 1 1 3 1 2 1 1 1 1
## [491] 2 1 1 3 3 1 1 3 3 1 2 1 2 1 3 2 1 3 1 1 3 2 3 3 2 2 2 3 3 3 3 2 3 2 3
## [526] 3 3 3 2 2 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 1 1 3 1 1 3
## [561] 1 1 3 1 1 1 1 2 1 3 1 3 1 3 1 3 1 2 1 1 3 1 3 1 2 2 1 3 1 3 2 2 1 2 2
## [596] 1 1 3 2 2 1 1 3 1 1 3 1 3 1 3 3 1 1 3 1 2 3 3 1 2 1 2 3 1 2 1 2 1 3 1
## [631] 2 1 3 1 3 1 1 2 1 1 3 1 3 1 1 3 1 3 3 1 2 1 2 1 3 2 1 3 1 2 1 2 2 1 1
## [666] 3 1 3 1 1 3 1 2 2 1 2 3 1 3 2 1 3 2 1 2 1 2 2 1 2 1 2 3 2 1 1 3 1 3 3
## [701] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 2 1 1 3 1 1 3 1 1 1 1 3 1 1 1
## [736] 1 3 1 1 3 1 3 1 2 3 1 3 3 1 2 3 2 1 2 1 3 1 2 1 2 1 2 1 3 1 3 1 2 1 3
## [771] 3 3 3 2 1 3 3 2 1 2 1 1 1 1 2 2 2 2 1 2 1 3 3 3 2 2 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 812079.9 709020.5 1018348.0
## (between_SS / total_SS = 40.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of Defense vs. Speed, points colored by k-means cluster
plot(pokemon[, c("Defense", "Speed")],
     col = km.out$cluster,
     xlab = "Defense", ylab = "Speed",
     main = paste("k-means clustering of Pokemon with", k, "clusters"))
Chapter 2 - Hierarchical Clustering
Introduction to hierarchical clustering - creating clusters when the number of clusters is not known ahead of time:
Selecting the number of clusters - dendrograms (trees):
Clustering linkage and practical matters - how to determine distances between clusters:
Example code includes:
# Synthetic 2-D data set: 50 points (data values unchanged from the course)
x <- matrix(data=NA, nrow=50, ncol=2)
x[, 1] <- c( 3.37, 1.44, 2.36, 2.63, 2.4, 1.89, 3.51, 1.91, 4.02, 1.94, 3.3, 4.29, 0.61, 1.72,
1.87, 2.64, 1.72, -0.66, -0.44, 3.32, 1.69, 0.22, 1.83, 3.21, 3.9, -5.43, -5.26,
-6.76, -4.54, -5.64, -4.54, -4.3, -3.96, -5.61, -4.5, -1.72, -0.78, -0.85, -2.41,
0.04, 0.21, -0.36, 0.76, -0.73, -1.37, 0.43, -0.81, 1.44, -0.43, 0.66
)
x[, 2] <- c( 2.32, 1.22, 3.58, 2.64, 2.09, 2.28, 2.68, 2.09, -0.99, 2.28, 1.63, 2.19, 2.58, 3.4,
1.27, 3.3, 2.34, 3.04, 2.92, 2.72, 0.96, 1.91, 2.62, 1.05, 1.46, 2.58, 2.77, 2.46,
1.11, 0.9, 3.51, 2.26, 2.09, 1.88, 0.81, -1.39, -2.22, -2.18, -1.07, -1.18, -0.61,
-2.48, -1.35, -0.61, -3.11, -2.86, -3.13, -3.46, -1.92, -1.35
)
str(x)
## num [1:50, 1:2] 3.37 1.44 2.36 2.63 2.4 1.89 3.51 1.91 4.02 1.94 ...
# Hierarchical clustering on Euclidean distances (dist() default): hclust.out
hclust.out <- hclust(d = dist(x))
# Inspect the fitted object
summary(hclust.out)
## Length Class Mode
## merge 98 -none- numeric
## height 49 -none- numeric
## order 50 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
# Cut the dendrogram at height 7
cutree(hclust.out, h = 7)
## [1] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Cut the tree into exactly 3 clusters
cutree(hclust.out, k = 3)
## [1] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

# The distance matrix is the same for all three fits, so compute it once
d.x <- dist(x)
# Complete linkage: cluster distance = furthest pair of points
hclust.complete <- hclust(d.x, method = "complete")
# Average linkage: cluster distance = mean of all pairwise distances
hclust.average <- hclust(d.x, method = "average")
# Single linkage: cluster distance = closest pair of points
hclust.single <- hclust(d.x, method = "single")

# Dendrogram for each linkage choice
plot(hclust.complete)
plot(hclust.average)
plot(hclust.single)
# Column means of the raw pokemon matrix
colMeans(pokemon)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 69.25875 79.00125 73.84250 72.82000 71.90250
## Speed
## 68.27750
# Column standard deviations — the spreads differ, so scaling is warranted
apply(pokemon, 2, sd)
## HitPoints Attack Defense SpecialAttack SpecialDefense
## 25.53467 32.45737 31.18350 32.72229 27.82892
## Speed
## 29.06047
# Standardize every column (mean 0, sd 1) before clustering
pokemon.scaled <- scale(pokemon)
# Complete-linkage hierarchical clustering on the scaled data: hclust.pokemon
hclust.pokemon <- hclust(dist(pokemon.scaled), method = "complete")
Chapter 3 - Dimensionality Reduction with PCA
Introduction to PCA - a popular type of dimensionality reduction to find structure in features, and aid in visualization:
Visualizing and interpreting PCA results:
Practical issues with PCA - scaling, missing values (drop and/or impute), categorical data (drop or encode as numbers):
Example code includes:
pokemon <- matrix(nrow=50, ncol=4, byrow=FALSE,
data=c( 58, 90, 70, 60, 60, 44, 100, 80, 80, 60, 150, 62, 75, 70, 115, 74, 74,
40, 95, 80, 25, 51, 48, 45, 35, 20, 60, 70, 70, 80, 57, 64, 75, 101, 50,
60, 85, 95, 58, 100, 95, 91, 62, 70, 60, 70, 50, 50, 70, 150, 64, 100,
94, 80, 55, 38, 77, 145, 100, 55, 100, 77, 98, 130, 45, 108, 94, 35, 65,
120, 35, 65, 72, 45, 55, 40, 70, 20, 55, 100, 24, 78, 98, 72, 75, 100, 120,
155, 89, 150, 125, 90, 48, 40, 110, 85, 85, 50, 110, 120, 58, 70, 50, 110,
90, 33, 77, 150, 70, 145, 120, 62, 63, 100, 20, 133, 131, 30, 65, 130, 70,
65, 48, 55, 40, 90, 50, 50, 65, 80, 86, 52, 63, 72, 70, 89, 70, 109, 77,
120, 79, 129, 54, 50, 70, 140, 40, 62, 70, 100, 80, 80, 66, 45, 80, 70,
90, 110, 95, 40, 90, 65, 101, 65, 20, 32, 20, 105, 60, 45, 45, 59, 48, 63,
60, 25, 65, 40, 70, 100, 23, 81, 101, 29, 48, 112, 100, 81, 48, 90, 81,
108, 68, 25, 100, 20, 35, 65, 90, 90
)
)
colnames(pokemon) <- c( "HitPoint", "Attack", "Defense", "Speed" )
rownames(pokemon) <- c( 'Quilava', 'Goodra', 'Mothim', 'Marowak', 'Chandelure', 'Helioptile',
'MeloettaAria Forme', 'MetagrossMega Metagross', 'Sawsbuck', 'Probopass',
'GiratinaAltered Forme', 'Tranquill', 'Simisage', 'Scizor', 'Jigglypuff',
'Carracosta', 'Ferrothorn', 'Kadabra', 'Sylveon', 'Golem', 'Magnemite',
'Vanillish', 'Unown', 'Snivy', 'Tynamo', 'Duskull', 'Beautifly', 'Marill',
'Lunatone', 'Flygon', 'Bronzor', 'Monferno', 'Simisear', 'Aromatisse',
'Scraggy', 'Scolipede', 'Staraptor', 'GyaradosMega Gyarados', 'Tyrunt', 'Zekrom',
'Gyarados', 'Cobalion', 'Espurr', 'Spheal', 'Dodrio', 'Torkoal', 'Cacnea',
'Trubbish', 'Lucario', 'GiratinaOrigin Forme'
)
str(pokemon)
## num [1:50, 1:4] 58 90 70 60 60 44 100 80 80 60 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Quilava" "Goodra" "Mothim" "Marowak" ...
## ..$ : chr [1:4] "HitPoint" "Attack" "Defense" "Speed"
colMeans(pokemon)
## HitPoint Attack Defense Speed
## 71.08 81.22 78.44 66.58
head(pokemon)
## HitPoint Attack Defense Speed
## Quilava 58 64 58 80
## Goodra 90 100 70 80
## Mothim 70 94 50 66
## Marowak 60 80 110 45
## Chandelure 60 55 90 80
## Helioptile 44 38 33 70
# Perform scaled PCA: pr.out
pr.out <- prcomp(pokemon, scale=TRUE)
# Inspect model output
summary(pr.out)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.4420 1.0013 0.7941 0.53595
## Proportion of Variance 0.5199 0.2507 0.1577 0.07181
## Cumulative Proportion 0.5199 0.7705 0.9282 1.00000
biplot(pr.out)
# Variability of each principal component: pr.var
pr.var <- (pr.out$sdev)^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
# (fixed typo in the axis label: "Cummulative" -> "Cumulative")
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# One-column matrix holding the "Total" stat for each of the 50 sampled pokemon
pokeTotal <- matrix(ncol=1, nrow=50,
data=c( 405, 600, 424, 425, 520, 289, 600, 700, 475, 525, 680, 358, 498, 500,
270, 495, 489, 400, 525, 495, 325, 395, 336, 308, 275, 295, 395, 250,
440, 520, 300, 405, 498, 462, 348, 485, 485, 640, 362, 680, 540, 580,
355, 290, 460, 470, 335, 329, 525, 680
)
)
# Prepend the Total column to the existing stats matrix
pokemon <- cbind(pokeTotal, pokemon)
# Name the newly added first column
colnames(pokemon)[1] <- "Total"
str(pokemon)
## num [1:50, 1:5] 405 600 424 425 520 289 600 700 475 525 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Quilava" "Goodra" "Mothim" "Marowak" ...
## ..$ : chr [1:5] "Total" "HitPoint" "Attack" "Defense" ...
colMeans(pokemon)
## Total HitPoint Attack Defense Speed
## 448.82 71.08 81.22 78.44 66.58
# Mean of each variable
colMeans(pokemon)
## Total HitPoint Attack Defense Speed
## 448.82 71.08 81.22 78.44 66.58
# Standard deviation of each variable — "Total" has a far larger spread
# (119.3 vs ~25-33), so an unscaled PCA will be dominated by it
apply(pokemon, 2, sd)
## Total HitPoint Attack Defense Speed
## 119.32321 25.62193 33.03078 32.05809 27.51036
# PCA model with scaling: pr.with.scaling
pr.with.scaling <- prcomp(pokemon, scale=TRUE)
# PCA model without scaling: pr.without.scaling
pr.without.scaling <- prcomp(pokemon, scale=FALSE)
# Create biplots of both for comparison
biplot(pr.with.scaling)
biplot(pr.without.scaling)
Chapter 4 - Case Study
Introduction to the case study:
PCA Review and Next Steps:
Example code includes:
# Cached to avoid repeated downloads
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"

# Read the raw CSV: wisc.df
wisc.df <- read.csv(url, stringsAsFactors=FALSE)

# Keep the 30 feature columns (3:32) as a numeric matrix: wisc.data
wisc.data <- as.matrix(wisc.df[, 3:32])

# Label each row with its patient id
rownames(wisc.data) <- wisc.df$id

# Numeric diagnosis vector: 1 when diagnosis == "M", else 0
diagnosis <- as.numeric(wisc.df$diagnosis == "M")
And, continuing with:
# Check column means and standard deviations
colMeans(wisc.data)
## radius_mean texture_mean perimeter_mean
## 1.412729e+01 1.928965e+01 9.196903e+01
## area_mean smoothness_mean compactness_mean
## 6.548891e+02 9.636028e-02 1.043410e-01
## concavity_mean concave.points_mean symmetry_mean
## 8.879932e-02 4.891915e-02 1.811619e-01
## fractal_dimension_mean radius_se texture_se
## 6.279761e-02 4.051721e-01 1.216853e+00
## perimeter_se area_se smoothness_se
## 2.866059e+00 4.033708e+01 7.040979e-03
## compactness_se concavity_se concave.points_se
## 2.547814e-02 3.189372e-02 1.179614e-02
## symmetry_se fractal_dimension_se radius_worst
## 2.054230e-02 3.794904e-03 1.626919e+01
## texture_worst perimeter_worst area_worst
## 2.567722e+01 1.072612e+02 8.805831e+02
## smoothness_worst compactness_worst concavity_worst
## 1.323686e-01 2.542650e-01 2.721885e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1.146062e-01 2.900756e-01 8.394582e-02
apply(wisc.data, 2, FUN=sd)
## radius_mean texture_mean perimeter_mean
## 3.524049e+00 4.301036e+00 2.429898e+01
## area_mean smoothness_mean compactness_mean
## 3.519141e+02 1.406413e-02 5.281276e-02
## concavity_mean concave.points_mean symmetry_mean
## 7.971981e-02 3.880284e-02 2.741428e-02
## fractal_dimension_mean radius_se texture_se
## 7.060363e-03 2.773127e-01 5.516484e-01
## perimeter_se area_se smoothness_se
## 2.021855e+00 4.549101e+01 3.002518e-03
## compactness_se concavity_se concave.points_se
## 1.790818e-02 3.018606e-02 6.170285e-03
## symmetry_se fractal_dimension_se radius_worst
## 8.266372e-03 2.646071e-03 4.833242e+00
## texture_worst perimeter_worst area_worst
## 6.146258e+00 3.360254e+01 5.693570e+02
## smoothness_worst compactness_worst concavity_worst
## 2.283243e-02 1.573365e-01 2.086243e-01
## concave.points_worst symmetry_worst fractal_dimension_worst
## 6.573234e-02 6.186747e-02 1.806127e-02
# PCA on the standardized features (column scales differ widely): wisc.pr
wisc.pr <- prcomp(wisc.data, scale = TRUE)
# Proportion of variance captured by each principal component
summary(wisc.pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Biplot of observations and variable loadings for wisc.pr
biplot(wisc.pr)
# PC1 vs PC2, colored by diagnosis (diagnosis + 1 gives colors 1 and 2)
plot(wisc.pr$x[, 1:2], col = diagnosis + 1,
     xlab = "PC1", ylab = "PC2")
# Same view for PC1 vs PC3
plot(wisc.pr$x[, c(1, 3)], col = diagnosis + 1,
     xlab = "PC1", ylab = "PC3")
# Arrange the next two plots side by side
par(mfrow = c(1, 2))
# Calculate variability of each component
pr.var <- (wisc.pr$sdev)^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
# (fixed typo in the axis label: "Cummulative" -> "Cumulative")
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
par(mfrow = c(1, 1))
# Standardize the features: data.scaled
data.scaled <- scale(wisc.data)
# Pairwise Euclidean distances (dist() default): data.dist
data.dist <- dist(data.scaled)
# Complete-linkage hierarchical clustering: wisc.hclust
wisc.hclust <- hclust(data.dist, method = "complete")
# Cut the dendrogram into 4 clusters: wisc.hclust.clusters
wisc.hclust.clusters <- cutree(wisc.hclust, k = 4)
# Cross-tabulate cluster membership against the actual diagnoses
table(wisc.hclust.clusters, diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
# Two-cluster k-means on the scaled features: wisc.km
wisc.km <- kmeans(scale(wisc.data), centers = 2, nstart = 20)
# Cross-tabulate k-means clusters against the actual diagnoses
table(wisc.km$cluster, diagnosis)
## diagnosis
## 0 1
## 1 14 175
## 2 343 37
# Cross-tabulate k-means clusters against the hierarchical clusters
table(wisc.km$cluster, wisc.hclust.clusters)
## wisc.hclust.clusters
## 1 2 3 4
## 1 160 7 20 2
## 2 17 0 363 0
# Hierarchical clustering on the first 7 principal components: wisc.pr.hclust
wisc.pr.hclust <- hclust(dist(wisc.pr$x[, 1:7]), method = "complete")
# Cut the tree into 4 clusters: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters <- cutree(wisc.pr.hclust, k = 4)
# Compare the PCA-based clusters to the actual diagnoses
table(wisc.pr.hclust.clusters, diagnosis)
## diagnosis
## wisc.pr.hclust.clusters 0 1
## 1 5 113
## 2 350 97
## 3 2 0
## 4 0 2
# Compare to k-means and hierarchical
table(wisc.km$cluster, diagnosis)
## diagnosis
## 0 1
## 1 14 175
## 2 343 37
table(wisc.hclust.clusters, diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
Chapter 1 - Regression Models: Fitting and Training
Max Kuhn, author of the caret package for supervised learning:
Out-of-sample error measurement - Zach Mayer, co-author of the caret package:
Cross-validation - improved approach of taking multiple test/train and averaging out-of-sample error rates:
Example code includes:
data(diamonds, package="ggplot2")
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Linear model of price on all other columns: model
model <- lm(price ~ ., data = diamonds)
# In-sample predictions: p
p <- predict(model)
# Prediction errors: error
error <- p - diamonds$price
# In-sample (training) RMSE
sqrt(mean(error^2))
## [1] 1129.843
# Shuffle row indices: rows
rows <- sample(nrow(diamonds), replace = FALSE)
# Randomly order the data
diamonds <- diamonds[rows, ]
# Index of the 80/20 split point: split
split <- round(nrow(diamonds) * 0.8)
# First 80% of the shuffled rows become the training set
train <- diamonds[seq_len(split), ]
# Remaining 20% become the test set
test <- diamonds[-seq_len(split), ]
# Fit the model on the training set only: model
model <- lm(price ~ ., data = train)
# Predict on the held-out test set: p
p <- predict(model, newdata = test)
# Out-of-sample errors: error
error <- p - test$price
# Out-of-sample RMSE
sqrt(mean(error^2))
## [1] 1119.8
## [1] 1119.8
# Fit lm model using 10-fold CV: model
model <- caret::train(
  price ~ .,
  data = diamonds,
  method = "lm",
  trControl = caret::trainControl(method = "cv", number = 10,
                                  verboseIter = TRUE)
)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## + Fold01: intercept=TRUE
## - Fold01: intercept=TRUE
## + Fold02: intercept=TRUE
## - Fold02: intercept=TRUE
## + Fold03: intercept=TRUE
## - Fold03: intercept=TRUE
## + Fold04: intercept=TRUE
## - Fold04: intercept=TRUE
## + Fold05: intercept=TRUE
## - Fold05: intercept=TRUE
## + Fold06: intercept=TRUE
## - Fold06: intercept=TRUE
## + Fold07: intercept=TRUE
## - Fold07: intercept=TRUE
## + Fold08: intercept=TRUE
## - Fold08: intercept=TRUE
## + Fold09: intercept=TRUE
## - Fold09: intercept=TRUE
## + Fold10: intercept=TRUE
## - Fold10: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Linear Regression
##
## 53940 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 48546, 48545, 48545, 48545, 48546, 48546, ...
## Resampling results:
##
## RMSE Rsquared
## 1130.963 0.919703
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
data(BostonHousing, package="mlbench")
Boston <- BostonHousing
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Fit lm model using 5-fold CV: model
model <- caret::train(
  medv ~ .,
  data = Boston,
  method = "lm",
  trControl = caret::trainControl(method = "cv", number = 5,
                                  verboseIter = TRUE)
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Linear Regression
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 407, 404, 404, 404
## Resampling results:
##
## RMSE Rsquared
## 4.904787 0.7223226
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Fit lm model using 5 x 5-fold CV: model
# BUG FIX: repeated cross-validation requires method = "repeatedcv".
# With method = "cv", caret ignores the `repeats` argument, so the original
# call ran only a single round of 5-fold CV despite the comment's intent
# (the console output shown below reflects the old, non-repeated call).
model <- caret::train(
  medv ~ ., Boston,
  method = "lm",
  trControl = caret::trainControl(
    method = "repeatedcv", number = 5,
    repeats = 5, verboseIter = TRUE
  )
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Linear Regression
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 404, 406, 404, 405
## Resampling results:
##
## RMSE Rsquared
## 4.885857 0.7205083
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Predict on full Boston dataset
predict(model, newdata=Boston)
## 1 2 3 4 5 6
## 30.0038434 25.0255624 30.5675967 28.6070365 27.9435242 25.2562845
## 7 8 9 10 11 12
## 23.0018083 19.5359884 11.5236369 18.9202621 18.9994965 21.5867957
## 13 14 15 16 17 18
## 20.9065215 19.5529028 19.2834821 19.2974832 20.5275098 16.9114013
## 19 20 21 22 23 24
## 16.1780111 18.4061360 12.5238575 17.6710367 15.8328813 13.8062853
## 25 26 27 28 29 30
## 15.6783383 13.3866856 15.4639765 14.7084743 19.5473729 20.8764282
## 31 32 33 34 35 36
## 11.4551176 18.0592329 8.8110574 14.2827581 13.7067589 23.8146353
## 37 38 39 40 41 42
## 22.3419371 23.1089114 22.9150261 31.3576257 34.2151023 28.0205641
## 43 44 45 46 47 48
## 25.2038663 24.6097927 22.9414918 22.0966982 20.4232003 18.0365509
## 49 50 51 52 53 54
## 9.1065538 17.2060775 21.2815254 23.9722228 27.6558508 24.0490181
## 55 56 57 58 59 60
## 15.3618477 31.1526495 24.8568698 33.1091981 21.7753799 21.0849356
## 61 62 63 64 65 66
## 17.8725804 18.5111021 23.9874286 22.5540887 23.3730864 30.3614836
## 67 68 69 70 71 72
## 25.5305651 21.1133856 17.4215379 20.7848363 25.2014886 21.7426577
## 73 74 75 76 77 78
## 24.5574496 24.0429571 25.5049972 23.9669302 22.9454540 23.3569982
## 79 80 81 82 83 84
## 21.2619827 22.4281737 28.4057697 26.9948609 26.0357630 25.0587348
## 85 86 87 88 89 90
## 24.7845667 27.7904920 22.1685342 25.8927642 30.6746183 30.8311062
## 91 92 93 94 95 96
## 27.1190194 27.4126673 28.9412276 29.0810555 27.0397736 28.6245995
## 97 98 99 100 101 102
## 24.7274498 35.7815952 35.1145459 32.2510280 24.5802202 25.5941347
## 103 104 105 106 107 108
## 19.7901368 20.3116713 21.4348259 18.5399401 17.1875599 20.7504903
## 109 110 111 112 113 114
## 22.6482911 19.7720367 20.6496586 26.5258674 20.7732364 20.7154831
## 115 116 117 118 119 120
## 25.1720888 20.4302559 23.3772463 23.6904326 20.3357836 20.7918087
## 121 122 123 124 125 126
## 21.9163207 22.4710778 20.5573856 16.3666198 20.5609982 22.4817845
## 127 128 129 130 131 132
## 14.6170663 15.1787668 18.9386859 14.0557329 20.0352740 19.4101340
## 133 134 135 136 137 138
## 20.0619157 15.7580767 13.2564524 17.2627773 15.8784188 19.3616395
## 139 140 141 142 143 144
## 13.8148390 16.4488147 13.5714193 3.9888551 14.5949548 12.1488148
## 145 146 147 148 149 150
## 8.7282236 12.0358534 15.8208206 8.5149902 9.7184414 14.8045137
## 151 152 153 154 155 156
## 20.8385815 18.3010117 20.1228256 17.2860189 22.3660023 20.1037592
## 157 158 159 160 161 162
## 13.6212589 33.2598270 29.0301727 25.5675277 32.7082767 36.7746701
## 163 164 165 166 167 168
## 40.5576584 41.8472817 24.7886738 25.3788924 37.2034745 23.0874875
## 169 170 171 172 173 174
## 26.4027396 26.6538211 22.5551466 24.2908281 22.9765722 29.0719431
## 175 176 177 178 179 180
## 26.5219434 30.7220906 25.6166931 29.1374098 31.4357197 32.9223157
## 181 182 183 184 185 186
## 34.7244046 27.7655211 33.8878732 30.9923804 22.7182001 24.7664781
## 187 188 189 190 191 192
## 35.8849723 33.4247672 32.4119915 34.5150995 30.7610949 30.2893414
## 193 194 195 196 197 198
## 32.9191871 32.1126077 31.5587100 40.8455572 36.1277008 32.6692081
## 199 200 201 202 203 204
## 34.7046912 30.0934516 30.6439391 29.2871950 37.0714839 42.0319312
## 205 206 207 208 209 210
## 43.1894984 22.6903480 23.6828471 17.8544721 23.4942899 17.0058772
## 211 212 213 214 215 216
## 22.3925110 17.0604275 22.7389292 25.2194255 11.1191674 24.5104915
## 217 218 219 220 221 222
## 26.6033477 28.3551871 24.9152546 29.6865277 33.1841975 23.7745666
## 223 224 225 226 227 228
## 32.1405196 29.7458199 38.3710245 39.8146187 37.5860575 32.3995325
## 229 230 231 232 233 234
## 35.4566524 31.2341151 24.4844923 33.2883729 38.0481048 37.1632863
## 235 236 237 238 239 240
## 31.7138352 25.2670557 30.1001074 32.7198716 28.4271706 28.4294068
## 241 242 243 244 245 246
## 27.2937594 23.7426248 24.1200789 27.4020841 16.3285756 13.3989126
## 247 248 249 250 251 252
## 20.0163878 19.8618443 21.2883131 24.0798915 24.2063355 25.0421582
## 253 254 255 256 257 258
## 24.9196401 29.9456337 23.9722832 21.6958089 37.5110924 43.3023904
## 259 260 261 262 263 264
## 36.4836142 34.9898859 34.8121151 37.1663133 40.9892850 34.4463409
## 265 266 267 268 269 270
## 35.8339755 28.2457430 31.2267359 40.8395575 39.3179239 25.7081791
## 271 272 273 274 275 276
## 22.3029553 27.2034097 28.5116947 35.4767660 36.1063916 33.7966827
## 277 278 279 280 281 282
## 35.6108586 34.8399338 30.3519266 35.3098070 38.7975697 34.3312319
## 283 284 285 286 287 288
## 40.3396307 44.6730834 31.5968909 27.3565923 20.1017415 27.0420667
## 289 290 291 292 293 294
## 27.2136458 26.9139584 33.4356331 34.4034963 31.8333982 25.8178324
## 295 296 297 298 299 300
## 24.4298235 28.4576434 27.3626700 19.5392876 29.1130984 31.9105461
## 301 302 303 304 305 306
## 30.7715945 28.9427587 28.8819102 32.7988723 33.2090546 30.7683179
## 307 308 309 310 311 312
## 35.5622686 32.7090512 28.6424424 23.5896583 18.5426690 26.8788984
## 313 314 315 316 317 318
## 23.2813398 25.5458025 25.4812006 20.5390990 17.6157257 18.3758169
## 319 320 321 322 323 324
## 24.2907028 21.3252904 24.8868224 24.8693728 22.8695245 19.4512379
## 325 326 327 328 329 330
## 25.1178340 24.6678691 23.6807618 19.3408962 21.1741811 24.2524907
## 331 332 333 334 335 336
## 21.5926089 19.9844661 23.3388800 22.1406069 21.5550993 20.6187291
## 337 338 339 340 341 342
## 20.1609718 19.2849039 22.1667232 21.2496577 21.4293931 30.3278880
## 343 344 345 346 347 348
## 22.0473498 27.7064791 28.5479412 16.5450112 14.7835964 25.2738008
## 349 350 351 352 353 354
## 27.5420512 22.1483756 20.4594409 20.5460542 16.8806383 25.4025351
## 355 356 357 358 359 360
## 14.3248663 16.5948846 19.6370469 22.7180661 22.2021889 19.2054806
## 361 362 363 364 365 366
## 22.6661611 18.9319262 18.2284680 20.2315081 37.4944739 14.2819073
## 367 368 369 370 371 372
## 15.5428625 10.8316232 23.8007290 32.6440736 34.6068404 24.9433133
## 373 374 375 376 377 378
## 25.9998091 6.1263250 0.7777981 25.3071306 17.7406106 20.2327441
## 379 380 381 382 383 384
## 15.8333130 16.8351259 14.3699483 18.4768283 13.4276828 13.0617751
## 385 386 387 388 389 390
## 3.2791812 8.0602217 6.1284220 5.6186481 6.4519857 14.2076474
## 391 392 393 394 395 396
## 17.2122518 17.2988727 9.8911664 20.2212419 17.9418118 20.3044578
## 397 398 399 400 401 402
## 19.2955908 16.3363278 6.5516232 10.8901678 11.8814587 17.8117451
## 403 404 405 406 407 408
## 18.2612659 12.9794878 7.3781636 8.2111586 8.0662619 19.9829479
## 409 410 411 412 413 414
## 13.7075637 19.8526845 15.2230830 16.9607198 1.7185181 11.8057839
## 415 416 417 418 419 420
## -4.2813107 9.5837674 13.3666081 6.8956236 6.1477985 14.6066179
## 421 422 423 424 425 426
## 19.6000267 18.1242748 18.5217713 13.1752861 14.6261762 9.9237498
## 427 428 429 430 431 432
## 16.3459065 14.0751943 14.2575624 13.0423479 18.1595569 18.6955435
## 433 434 435 436 437 438
## 21.5272830 17.0314186 15.9609044 13.3614161 14.5207938 8.8197601
## 439 440 441 442 443 444
## 4.8675110 13.0659131 12.7060970 17.2955806 18.7404850 18.0590103
## 445 446 447 448 449 450
## 11.5147468 11.9740036 17.6834462 18.1269524 17.5183465 17.2274251
## 451 452 453 454 455 456
## 16.5227163 19.4129110 18.5821524 22.4894479 15.2800013 15.8208934
## 457 458 459 460 461 462
## 12.6872558 12.8763379 17.1866853 18.5124761 19.0486053 20.1720893
## 463 464 465 466 467 468
## 19.7740732 22.4294077 20.3191185 17.8861625 14.3747852 16.9477685
## 469 470 471 472 473 474
## 16.9840576 18.5883840 20.1671944 22.9771803 22.4558073 25.5782463
## 475 476 477 478 479 480
## 16.3914763 16.1114628 20.5348160 11.5427274 19.2049630 21.8627639
## 481 482 483 484 485 486
## 23.4687887 27.0988732 28.5699430 21.0839878 19.4551620 22.2222591
## 487 488 489 490 491 492
## 19.6559196 21.3253610 11.8558372 8.2238669 3.6639967 13.7590854
## 493 494 495 496 497 498
## 15.9311855 20.6266205 20.6124941 16.8854196 14.0132079 19.1085414
## 499 500 501 502 503 504
## 21.2980517 18.4549884 20.4687085 23.5333405 22.3757189 27.6274261
## 505 506
## 26.1279668 22.3442123
Chapter 2 - Classification Models
Logistic regression on mlbench::Sonar - classification models for categorical outcomes:
Confusion matrix - predicted outcomes vs. actual reality:
Class probabilities and class predictions - can modify thresholds for declaring positive depending on desired specificity vs. sensitivity:
Receiver Operating Characteristic (ROC) curves - looking at many confusion matrices is time-consuming and non-scientific/systematic:
Area Under the Curve (AUC) - models that are more random will closely follow the diagonal line, while perfect models hit the upper-left corner:
Example code includes:
# Load the Sonar data (208 rows, 60 numeric predictors, factor Class: M/R).
data(Sonar, package="mlbench")
# Shuffle row indices: rows
# NOTE(review): no set.seed() call precedes this, so the shuffle -- and every
# downstream result -- differs on each run; add a seed for reproducibility.
rows <- sample(nrow(Sonar), replace=FALSE)
# Randomly order data: Sonar
Sonar <- Sonar[rows, ]
# Identify row to split on: split
# 60/40 train/test split (~125 training rows).
split <- round(nrow(Sonar) * 0.6)
# Create train
train <- Sonar[1:split, ]
# Create test
test <- Sonar[-(1:split), ]
# Fit glm model: model
# With 60 predictors and only ~125 training rows, perfect separation is
# likely -- the "algorithm did not converge" / "fitted probabilities
# numerically 0 or 1" warnings below are expected, not a coding error.
model <- glm(Class ~ ., family="binomial", data=train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict on test: p
p <- predict(model, newdata=test, type="response")
# Calculate class probabilities: p_class
# predict(type="response") returns P(Class == "R") (the second factor level),
# so p > 0.5 maps to "R". FIX: coerce to a factor sharing the reference's
# levels -- recent caret versions error on bare character predictions.
p_class <- factor(ifelse(p > 0.5, "R", "M"), levels=levels(test$Class))
# Create confusion matrix
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 26 10
## R 18 29
##
## Accuracy : 0.6627
## 95% CI : (0.5505, 0.7628)
## No Information Rate : 0.5301
## P-Value [Acc > NIR] : 0.009914
##
## Kappa : 0.3306
## Mcnemar's Test P-Value : 0.185877
##
## Sensitivity : 0.5909
## Specificity : 0.7436
## Pos Pred Value : 0.7222
## Neg Pred Value : 0.6170
## Prevalence : 0.5301
## Detection Rate : 0.3133
## Detection Prevalence : 0.4337
## Balanced Accuracy : 0.6672
##
## 'Positive' Class : M
##
# Apply threshold of 0.9: p_class
# Raising the threshold makes an "R" call more conservative (fewer false
# positives for R, at the cost of sensitivity for R).
# FIX: factor coercion -- recent caret versions reject character predictions.
p_class <- factor(ifelse(p > 0.9, "R", "M"), levels=levels(test$Class))
# Create confusion matrix
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 29 10
## R 15 29
##
## Accuracy : 0.6988
## 95% CI : (0.5882, 0.7947)
## No Information Rate : 0.5301
## P-Value [Acc > NIR] : 0.001298
##
## Kappa : 0.3998
## Mcnemar's Test P-Value : 0.423711
##
## Sensitivity : 0.6591
## Specificity : 0.7436
## Pos Pred Value : 0.7436
## Neg Pred Value : 0.6591
## Prevalence : 0.5301
## Detection Rate : 0.3494
## Detection Prevalence : 0.4699
## Balanced Accuracy : 0.7013
##
## 'Positive' Class : M
##
# Apply threshold of 0.10: p_class
# Lowering the threshold makes "R" calls aggressive (high sensitivity for R).
# FIX: factor coercion -- recent caret versions reject character predictions.
p_class <- factor(ifelse(p > 0.1, "R", "M"), levels=levels(test$Class))
# Create confusion matrix
# Qualified with caret:: for consistency with the other calls in this document.
caret::confusionMatrix(p_class, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction M R
## M 26 10
## R 18 29
##
## Accuracy : 0.6627
## 95% CI : (0.5505, 0.7628)
## No Information Rate : 0.5301
## P-Value [Acc > NIR] : 0.009914
##
## Kappa : 0.3306
## Mcnemar's Test P-Value : 0.185877
##
## Sensitivity : 0.5909
## Specificity : 0.7436
## Pos Pred Value : 0.7222
## Neg Pred Value : 0.6170
## Prevalence : 0.5301
## Detection Rate : 0.3133
## Detection Prevalence : 0.4337
## Balanced Accuracy : 0.6672
##
## 'Positive' Class : M
##
# Predict on test: p
# (Identical to the earlier predict() call -- recomputed so this snippet is
# self-contained.)
p <- predict(model, newdata=test, type="response")
# Make ROC curve
# colAUC() returns the area under the ROC curve (~0.72 here) and, with
# plotROC=TRUE, draws the curve as a side effect. This summarizes performance
# across ALL thresholds at once, replacing the manual threshold sweep above.
caTools::colAUC(p, test$Class, plotROC=TRUE)
## [,1]
## M vs. R 0.7246503
# Create trainControl object: myControl
# summaryFunction = twoClassSummary reports ROC/Sens/Spec instead of Accuracy;
# it requires classProbs = TRUE so class probabilities are computed per fold.
# Qualified with caret:: for consistency with the other caret calls here.
myControl <- caret::trainControl(
method = "cv",
number = 10,
summaryFunction = caret::twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
# Train glm with custom trainControl: model
# FIX: metric = "ROC" matches the summary function and silences the
# "The metric 'Accuracy' was not in the result set" warning shown below.
model <- caret::train(Class ~ ., data=Sonar, method="glm", metric="ROC", trControl=myControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
## + Fold01: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold01: parameter=none
## + Fold02: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold02: parameter=none
## + Fold03: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold03: parameter=none
## + Fold04: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold04: parameter=none
## + Fold05: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold05: parameter=none
## + Fold06: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold06: parameter=none
## + Fold07: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold07: parameter=none
## + Fold08: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold08: parameter=none
## + Fold09: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold09: parameter=none
## + Fold10: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print model to console
model
## Generalized Linear Model
##
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 187, 187, 187, 186, 188, 187, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7387879 0.7310606 0.7122222
Chapter 3 - Tuning Model Parameters
Random forests and wine - very robust against over-fitting, and frequently yield very accurate non-linear models:
Explore a wider model space - random forests require tuning (hyper-parameters):
Custom tuning grids - further customization of the tuneGrid data frame (most flexible, complete control of grid-search exploration):
Introducing glmnet - extension of generalized linear model (glm) with built-in variable selection:
Custom tuning grids with glmnet - ability to tune on both alpha and lambda:
Example code includes:
# The UCI wine-quality CSVs are semicolon-delimited, hence sep=";".
redWine <- read.csv("redWine.csv", sep=";")
str(redWine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
whiteWine <- read.csv("whiteWine.csv", sep=";")
str(whiteWine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
nRed <- 24
nWhite <- 76
# Draw a 100-row sample: 24 red + 76 white wines, without replacement.
# FIX: seq_len(nrow(x)) replaces the 1:nrow(x) idiom (which yields c(1, 0)
# for an empty frame); sampled rows are identical for non-empty input.
# NOTE(review): no set.seed() call, so the sample differs on each run.
wine <- rbind(redWine[sample(seq_len(nrow(redWine)), nRed, replace=FALSE), ],
whiteWine[sample(seq_len(nrow(whiteWine)), nWhite, replace=FALSE), ]
)
# Label rows by origin; rbind() above guarantees the red rows come first.
wine$color <- factor(c(rep("red", nRed), rep("white", nWhite)),
levels=c("red", "white")
)
str(wine)
## 'data.frame': 100 obs. of 13 variables:
## $ fixed.acidity : num 7.2 7.9 6.8 4.7 8.5 7.2 7.6 9.2 7.4 6.6 ...
## $ volatile.acidity : num 0.33 0.3 0.36 0.6 0.66 0.38 0.68 0.43 0.6 0.895 ...
## $ citric.acid : num 0.33 0.68 0.32 0.17 0.2 0.3 0.02 0.52 0.26 0.04 ...
## $ residual.sugar : num 1.7 8.3 1.8 2.3 2.1 1.8 1.3 2.3 7.3 2.3 ...
## $ chlorides : num 0.061 0.05 0.067 0.058 0.097 0.073 0.072 0.083 0.07 0.068 ...
## $ free.sulfur.dioxide : num 3 37.5 4 17 23 31 9 14 36 7 ...
## $ total.sulfur.dioxide: num 13 278 8 106 113 70 20 23 121 13 ...
## $ density : num 0.996 0.993 0.993 0.993 0.997 ...
## $ pH : num 3.23 3.01 3.36 3.85 3.13 3.42 3.17 3.35 3.37 3.53 ...
## $ sulphates : num 1.1 0.51 0.55 0.6 0.48 0.59 1.08 0.61 0.49 0.58 ...
## $ alcohol : num 10 12.3 12.8 12.9 9.2 9.5 9.2 11.3 9.4 10.8 ...
## $ quality : int 8 7 7 6 5 6 4 6 5 6 ...
## $ color : Factor w/ 2 levels "red","white": 1 1 1 1 1 1 1 1 1 1 ...
# Fit random forest: model
# tuneLength = 1 evaluates a single default hyper-parameter value (mtry=3 in
# the output below), so no real tuning happens -- this is the baseline fit.
model <- caret::train(
quality ~ .,
tuneLength = 1,
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## Loading required package: e1071
## Loading required package: ranger
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold5: mtry=3
## - Fold5: mtry=3
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 79, 80, 81, 80, 80
## Resampling results:
##
## RMSE Rsquared
## 0.9085138 0.2127601
##
## Tuning parameter 'mtry' was held constant at a value of 3
# Fit random forest: model
# tuneLength = 3 lets caret pick 3 candidate mtry values automatically
# (2, 7, 12 in the output below) and select the best by CV RMSE.
model <- caret::train(
quality ~ .,
tuneLength = 3,
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## + Fold1: mtry= 7
## - Fold1: mtry= 7
## + Fold1: mtry=12
## - Fold1: mtry=12
## + Fold2: mtry= 2
## - Fold2: mtry= 2
## + Fold2: mtry= 7
## - Fold2: mtry= 7
## + Fold2: mtry=12
## - Fold2: mtry=12
## + Fold3: mtry= 2
## - Fold3: mtry= 2
## + Fold3: mtry= 7
## - Fold3: mtry= 7
## + Fold3: mtry=12
## - Fold3: mtry=12
## + Fold4: mtry= 2
## - Fold4: mtry= 2
## + Fold4: mtry= 7
## - Fold4: mtry= 7
## + Fold4: mtry=12
## - Fold4: mtry=12
## + Fold5: mtry= 2
## - Fold5: mtry= 2
## + Fold5: mtry= 7
## - Fold5: mtry= 7
## + Fold5: mtry=12
## - Fold5: mtry=12
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 80, 80, 80, 80
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.8937233 0.2450648
## 7 0.8979628 0.2446744
## 12 0.9186630 0.2179967
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Plot model
plot(model)
# Fit random forest: model
# tuneGrid gives explicit control of the search space: exactly these three
# mtry values are evaluated, instead of caret's automatic spacing.
model <- caret::train(
quality ~ .,
tuneGrid = data.frame(mtry=c(2, 3, 7)),
data = wine, method = "ranger",
trControl = caret::trainControl(method = "cv", number = 5, verboseIter = TRUE)
)
## + Fold1: mtry=2
## - Fold1: mtry=2
## + Fold1: mtry=3
## - Fold1: mtry=3
## + Fold1: mtry=7
## - Fold1: mtry=7
## + Fold2: mtry=2
## - Fold2: mtry=2
## + Fold2: mtry=3
## - Fold2: mtry=3
## + Fold2: mtry=7
## - Fold2: mtry=7
## + Fold3: mtry=2
## - Fold3: mtry=2
## + Fold3: mtry=3
## - Fold3: mtry=3
## + Fold3: mtry=7
## - Fold3: mtry=7
## + Fold4: mtry=2
## - Fold4: mtry=2
## + Fold4: mtry=3
## - Fold4: mtry=3
## + Fold4: mtry=7
## - Fold4: mtry=7
## + Fold5: mtry=2
## - Fold5: mtry=2
## + Fold5: mtry=3
## - Fold5: mtry=3
## + Fold5: mtry=7
## - Fold5: mtry=7
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Print model to console
model
## Random Forest
##
## 100 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 80, 80, 80, 81, 79
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.8460142 0.3049359
## 3 0.8507520 0.2927151
## 7 0.8554196 0.2847809
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Plot model
plot(model)
# Create custom trainControl: myControl
# twoClassSummary requires classProbs = TRUE and reports ROC/Sens/Spec.
# FIX: qualified as caret::twoClassSummary for consistency with the other
# caret:: calls in this document (and so the snippet works without library(caret)).
myControl <- caret::trainControl(
method = "cv", number = 10,
summaryFunction = caret::twoClassSummary,
classProbs = TRUE, # IMPORTANT!
verboseIter = TRUE
)
## DO NOT HAVE (AND CANNOT FIND) DATASET "overfit"
# Fit glmnet model: model
# model <- caret::train(
# y ~ ., data=overfit,
# method = "glmnet",
# trControl = myControl
# )
# Print model to console
# model
# Print maximum ROC statistic
# max(model$results$ROC)
# Train glmnet with custom trainControl and tuning: model
# model <- caret::train(
# y ~ ., data=overfit,
# tuneGrid = expand.grid(alpha=0:1, lambda=seq(0.0001, 1, length=100)),
# method = "glmnet",
# trControl = myControl
# )
# Print model to console
# model
# Print maximum ROC statistic
# max(model$results$ROC)
Chapter 4 - Pre-processing data
Median imputation - real-world data has missing values which pose problems for many machine learning algorithms:
KNN imputation addresses the concern that median imputation may miss patterns in the NA data:
Multiple pre-processing methods - can do much more than median imputation or kNN imputation:
Handling low-information predictors - some variables may contain very little information (e.g., variables with no/low variance):
Principal Components Analysis (PCA) is especially valuable for linear modelling:
Example code includes:
v1 <- c( 5, NA, NA, 6, 4, 8, 1, 2, NA, NA, 1, 2, 5, 1, NA, 7, 4, 4, 10, NA, 7, 10, 3, 8, NA, 5, 3, 5, 2, 1, 3, 2, 10, 2, 3, 2, NA, 6, 5, NA, 6, 10, 6, 5, 10, NA, 3, 1, 4, NA, 9, 5, 10, 5, 10, 10, 8, 8, 5, 9, NA, 1, NA, 6, 1, 10, 4, 5, 8, 1, 5, 6, 1, 9, 10, 1, 1, 5, NA, 2, 2, 4, 5, 3, 3, 5, NA, 3, 4, 2, 1, 3, 4, 1, 2, 1, NA, 5, NA, 7, 10, 2, 4, NA, 10, 7, NA, 1, 1, 6, 1, 8, NA, 10, 3, 1, NA, 4, 1, 3, 1, 4, 10, 5, 5, 1, 7, 3, 8, NA, 5, 2, 5, NA, 3, 5, 4, 3, 4, 1, 3, 2, 9, 1, NA, 1, 3, 1, 3, 8, 1, 7, NA, NA, 1, 5, 1, 2, 1, 9, 10, NA, 3, 1, 5, 4, 5, 10, 3, 1, 3, 1, 1, 6, 8, 5, 2, 5, NA, 5, 1, 1, 6, 5, NA, 2, 1, 10, 5, 1, NA, 7, 5, 1, NA, 4, 8, 5, NA, 3, NA, 10, 1, 5, 1, 5, 10, 1, 1, 5, 8, 8, 1, 10, 10, 8, 1, 1, 6, 6, 1, 10, NA, 7, 10, 1, 10, 8, 1, 10, 7, 6, 8, NA, 3, 3, 10, 9, 8, 10, NA, 3, NA, 1, NA, 5, 8, 8, 4, 3, 1, 10, 6, 6, 9, 5, NA, 3, NA, 5, 10, 5, 8, NA, 7, 5, 10, NA, 10, 1, 8, 5, 3, 7, 3, 3, NA, NA, 1, 10, 3, 2, NA, 10, 7, 8, 10, 3, 6, 5, 1, 1, 8, NA, 1, 5, NA, 5, 8, NA, 8, 1, 10, 1, 8, NA, 1, 1, 7, 3, 2, 1, NA, 1, 1, 4, NA, 6, 1, 4, NA, 3, 3, NA, 1, 3, 10, NA, 8, 10, 10, NA, 5, 5, 8, 1, 6, 1, 1, 8, 10, 1, 2, 1, 7, 1, 5, 1, 3, 4, 5, 2, NA, 2, 1, 4, 5, 8, 8, 10, 6, NA, 3, 4, 2, 2, 6, 5, 1, 1, NA, 1, 4, 5, NA, 1, 1, NA, 3, NA, 1, 10, 3, 2, 2, 3, 7, NA, 2, 5, 1, 10, 3, 1, 1, 3, 3, NA, 3, NA, 3, 3, 5, 3, 1, 1, 4, 1, 2, NA, 1, 1, 10, 5, 8, 3, 8, 1, 5, 2, 3, 10, 4, 5, NA, 9, 5, NA, 1, 2, 1, 5, 5, 3, 6, 10, 10, NA, 4, NA, NA, 5, 1, 1, 5, NA, 1, 5, 1, 5, 4, 5, 3, 4, 2, 10, 10, 8, 5, 5, NA, 3, 6, 4, NA, 10, 10, 6, 4, 1, 3, 6, 6, 4, 5, 3, 4, 4, 5, 4, 5, 5, 9, 8, 5, NA, 3, 10, 3, 6 )
breast_cancer_x <- data.frame( X1 = c( v1, 1, NA, 4, NA, 5, NA, 1, 4, 4, 4, 6, 4, 4, 4, 1, 3, 8, 1, NA, 2, 1, 5, 5, 3, 6, 4, NA, NA, NA, 4, 1, 4, 10, 7, NA, 3, 4, NA, 6, 4, 7, NA, NA, 3, 2, 1, 5, NA, NA, 6, NA, 3, 5, 4, 2, 5, 6, NA, 3, 7, 3, 1, 3, 4, 3, 4, NA, 5, NA, 5, 5, 5, 1, 3, NA, 5, 3, 4, 8, 10, 8, 7, 3, 1, 10, 5, 5, NA, 1, 1, 5, 5, 6, NA, 5, 1, 8, 5, 9, 5, 4, 2, 10, 5, 4, 5, 4, 5, 3, 5, 3, 1, 4, NA, 5, 10, 4, 1, 5, 5, 10, NA, 8, NA, 2, 4, 3, NA, 4, 5, NA, 6, 7, 1, 5, 3, 4, 2, 2, 4, 6, 5, 1, NA, 3, NA, 10, 4, 4, 5, 4, NA, NA, 1, NA, 3, 1, 1, 5, 3, 3, 1, 5, 4, NA, 3, 5, 5, 7, 1, 1, 4, 1, 1, NA, NA, 5, NA, NA, 5, 3, 3, 2, NA, NA, 4, 1, 5, 1, 2, 10, 5, 5, 1, NA, 1, 1, 3, NA, 1, 1, 5, 3, 3, 3, 2, 5, 4, NA ))
v1 <- c( NA, 4, NA, 8, 1, 10, 1, 1, 1, 2, 1, 1, NA, 1, 7, 4, 1, 1, NA, 1, 3, 5, 1, 4, 1, 2, 2, 1, NA, 1, 1, 1, 7, NA, 1, NA, 10, 2, NA, 5, NA, 4, 10, 6, NA, 1, 7, 1, NA, NA, NA, 3, 3, 5, 5, 6, 10, 2, 2, 5, 3, 1, 10, 3, 1, 4, 1, 3, NA, 1, NA, 10, NA, 4, 6, 1, 1, 3, 1, NA, 2, 1, 2, 1, 5, 10, NA, 6, NA, 1, NA, NA, 1, 1, 1, 1, 1, 1, 6, 5, 3, 3, NA, 2, 10, 3, 10, 6, 1, 5, 3, 6, 3, 10, NA, NA, 3, NA, 1, 2, NA, NA, 10, 3, 4, 1, 5, 1, 3, 1, 1, 1, 10, NA, 1, NA, 1, 1, 1, 1, 1, 1, 5, NA, 1, 1, 4, NA, NA, 8, 1, 2, 10, NA, 1, 5, 2, 1, NA, 9, 7, 1, 1, 1, 1, 1, 6, 8, 1, 1, NA, NA, NA, 10, NA, 8, 1, 10, 1, 3, NA, 1, 1, 8, 7, 1, 5, 5, 8, 2, NA, 5, NA, 1, 1, 1, 4, 1, 1, NA, 7, 8, 1, 1, 1, NA, 10, 1, 1, 1, 10, 10, 1, 10, NA, 7, NA, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 10, 1, 1, 6, 1, 5, 5, 1, 5, 9, 1, 10, 4, 8, NA, 4, NA, 1, 8, 8, 10, 4, 1, 1, 1, 1, 1, 1, 10, 4, 1, 1, 2, NA, 3, 10, 10, 6, 1, NA, 1, 7, NA, 10, NA, 4, NA, 1, 10, 3, 8, 1, 4, 1, NA, 2, 1, 1, 1, 1, NA, 5, 1, NA, 4, 4, 4, 10, 10, 1, NA, 6, 1, 1, 8, 4, 1, 5, 3, NA, 2, 1, 4, 1, 10, 1, NA, 8, NA, 1, 8, 1, 1, 1, NA, 1, 1, 6, 5, 8, NA, 4, 6, 1, 1, 4, 1, 2, NA, 1, NA, 4, 4, 1, 2, 4, 6, 1, 5, 1, 1, 5, 3, 1, NA, NA, 6, 1, NA, 1, 4, 2, 1, 1, 4, 7, 1, 1, NA, 10, 10, 3, 10, 10, 2, NA, 1, 1, 10, 8, NA, 1, 3, NA, 1, 1, 1, 1, 1, 1, 1, 3, 1, 6, NA, 1, 1, 3, 6, 3, 1, 1, 1, 8, 1, NA, 2, 1, 1, 1, 2, 2, NA, 1, 3, 1, NA, 1, 2, 1, 3, 1, 1, 1, 10, 1, 5, 3, 7, 1, 2, 3, 2, 10, 3, 1, 1, NA, NA, 7, 1, 1, 3, NA, 1, NA, 9, 8, 10, 1, NA, NA, NA, 2, 1, 1, NA, 1, NA, 1, 1, 7, 1, 1, 1, NA, 3, NA, 6, 8, 1, 1, 1, 1, 1, 1, 1, 9, 6, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 2, NA, NA, NA, 10, 7, 1, 1, 1, 10, NA, NA, 1, 8, NA, 10, NA, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 10, 1, NA, 1, 1, 1, 1, 1, 6, 10, 1, 1, 1, 7, 1, 1, 4, 5, NA, 1, 1, NA, NA, 1, 4, 2, NA, 1, 1, 1, NA, 1, NA, NA, 1, 1, NA, 1, 1, 1, 10, 1, 1, 8, 1, 1, 2, 4, 1, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 7, NA, 1, 4, 10, NA, 6, 1, 1, 9, 1, 1, 1, 1, NA, 1, 7, 10, 1, NA, 1, 10, 1, 8, 1, 10, 5, NA, NA, 8, NA, 1, 1, 1, NA, 1, 1, 1, 4, 3, 5, 1, 1, 10, 1, 4, 10, 10, 3, 1, 1, 1, NA, 1, 1, NA, 3, 1, 1, 1, 1, 6, 1, 1, NA, NA, 1, 1, 7, 1, 1, 10, 2, 1, 1, 1, NA, 1, 1, 1, NA, NA, 1, 10, 1, 1, 2, 1, 1, NA, 1, 1, 4, NA, 1, 1, 1, NA, 1, 1, 1, 2, 1, 7, 10, NA, 2, 1, 3, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 1, 1, 10, 8, 8 ))
v1 <- c( 1, 4, 1, 8, 1, 10, NA, 2, 1, 1, 1, 1, 3, NA, 5, 6, 1, 1, 7, 1, 2, 5, 1, 5, 1, 3, 1, NA, 1, 3, NA, 1, 7, 1, 2, 1, 10, 1, 4, 3, NA, NA, NA, 5, NA, 1, 7, 1, 1, 7, 8, 3, NA, 5, 5, 6, 10, 4, 3, 5, 5, 1, 10, 4, NA, 2, 1, 4, 8, 1, 3, 2, 3, 5, 4, 2, 4, 1, 1, 1, 2, 1, 1, 1, 7, 6, 6, 6, 1, 1, NA, NA, 1, 1, 1, 1, 1, 1, NA, 6, 5, 4, 2, 3, 10, 4, NA, 8, 1, 4, NA, 4, 3, 10, 2, 1, 3, 5, 1, 1, NA, 1, 10, 5, NA, 1, 3, 1, 5, 1, 3, 1, 8, 1, 1, 1, 1, NA, 2, 1, NA, 1, 5, 1, 1, NA, 5, 1, 1, 7, 1, 4, 8, 1, 1, 5, 2, 1, 2, 10, 7, NA, 1, 1, 1, 1, 7, 10, 1, 1, 1, 1, NA, 10, 5, NA, 1, 10, 1, 3, 1, NA, 1, 8, 6, NA, 8, 6, 4, 3, NA, 10, 1, 1, 1, 1, 4, 1, 1, 1, 7, 8, 1, 1, 1, NA, 9, 1, 1, 1, NA, 8, 1, 10, 10, 8, 1, NA, 7, 3, 1, NA, 1, 6, 5, 1, 7, 9, 1, NA, NA, NA, 6, 5, 2, 4, 8, 8, 10, 3, 3, 1, 1, 1, 1, 1, 10, 4, 1, 1, 2, 4, 3, 10, 10, 6, NA, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 1, NA, 8, 10, 9, NA, 4, NA, 6, 5, 8, 1, 7, NA, 5, 4, 1, 3, 1, 1, 1, 7, NA, 1, 3, 6, 5, 10, 10, 1, 3, 6, NA, 1, 8, 4, 1, 7, 4, 3, 1, 2, 10, 1, 10, 1, 4, 4, 1, NA, 7, 1, 1, 1, 4, 1, 1, 5, 5, 7, 1, 4, 3, 1, 1, 6, 1, 2, 1, 1, 3, 6, NA, 1, NA, 6, 7, 1, 5, 1, 1, 5, 3, NA, 1, 1, 4, 1, 2, 1, 4, 3, NA, 1, 5, 10, 1, 1, 3, NA, 5, 5, NA, NA, 2, 4, NA, 1, NA, 8, 3, 3, NA, 3, 2, 1, 2, NA, NA, 1, 1, 4, 1, 3, 2, 1, 1, 2, 6, NA, 1, 1, 1, 7, 1, NA, 3, 1, 1, 1, 1, 3, 8, NA, 3, 1, 1, NA, NA, 1, 2, 2, 1, 1, 10, 2, 6, 2, 8, NA, 2, 1, 2, 10, 3, 3, 1, 10, 6, 8, 1, 1, 1, 1, 1, 2, 7, 10, NA, 1, NA, 1, 3, 2, 1, 1, NA, 1, 1, NA, 1, 9, NA, 1, 1, 5, 1, 2, 5, 9, 2, NA, 1, 1, 1, NA, NA, 8, 6, 6, 1, NA, 1, 1, 1, 1, 1, 1, 2, NA, 1, 7, 1, 2, 10, 8, 2, NA, 1, 10, 4, NA, 1, 9, 1, NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 10, 1, NA, 1, 1, 1, 1, 1, 7, 4, NA, NA, NA, 8, 1, 1, NA, 6, 1, 1, 1, 1, 3, 1, 4, 2, 1, 1, NA, 3, 1, 2, NA, 1, 1, 1, 1, NA, 3, 1, 10, 1, 1, 3, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, NA, 1, 10, 2, 1, NA, 8, 4, 10, 1, NA, 7, 2, NA, 1, 1, 1, 2, 10, 5, 1, 1, 1, 10, 1, NA, 1, 8, NA, 4, 2, 6, NA, 2, 3, 1, NA, NA, 1, 1, 6, NA, 10, 1, 1, 10, 1, 3, 10, 10, 1, 1, 3, 1, 1, 1, NA, 1, NA, 2, 1, 1, 3, NA, NA, 1, 1, 3, 1, 1, 4, 1, 4, 7, 4, 1, NA, 1, 1, 1, 1, 1, 1, 2, NA, 10, 1, 1, NA, 1, 1, 1, 1, 1, 5, 8, 1, NA, 1, 3, 3, 1, NA, 2, NA, 4, 10, 7, 1, 1, 2, NA, 4, 2, 1, 1, NA, 10, 10, NA, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, NA, 1, 1, 10, 6, 8 ))
v1 <- c( 1, NA, 1, NA, 3, 8, NA, 1, NA, 1, 1, 1, 3, 1, 10, 4, 1, 1, 6, 1, 10, NA, 1, NA, 1, 4, 1, 1, 1, 1, 1, NA, 3, 2, 1, 1, 8, 1, 9, 3, NA, 1, 2, NA, 4, 1, 4, 1, 3, NA, 1, NA, 2, 8, 6, NA, 1, 1, 1, NA, 5, 1, 1, NA, 1, 1, NA, 1, 3, NA, 1, 8, 2, 10, 1, 1, NA, 2, 1, 1, 1, 2, 1, 1, 8, 1, 4, 6, 1, 2, 1, 2, NA, 1, 1, 1, NA, 1, NA, 10, 1, NA, 1, 1, NA, 4, 8, 10, NA, 4, 2, 3, 10, 3, NA, 1, 1, 10, 1, NA, 2, 1, NA, 1, 7, 1, 7, 1, 4, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 3, NA, 1, 1, 6, 1, 1, 6, 1, 1, 1, NA, 4, 1, 1, 2, 1, 1, 8, 10, 1, 2, 1, 1, NA, 10, 4, 7, 1, 3, NA, 3, 1, 1, NA, 8, 4, 1, NA, 10, 10, 1, 8, 10, 1, 1, NA, 1, NA, 4, 1, 1, 5, 4, 1, 1, 1, 9, 3, 1, 1, 1, 10, 8, 1, NA, 10, 7, 1, 1, 7, 1, NA, 3 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 3, 3, 6, 1, 4, 5, 1, 3, 4, 5, 3, 5, 1, 1, NA, 5, 8, 2, 3, 3, 1, 1, 1, 2, 8, 1, 1, 1, NA, 10, 5, NA, 1, 2, 1, 1, 1, 1, 10, 6, 4, 10, NA, 1, 3, 2, 2, 1, 1, 1, 2, 1, 1, NA, NA, 1, NA, 3, NA, NA, 10, 1, NA, 10, 10, 1, 1, 8, 1, 1, 1, 6, 1, 8, 3, 1, 1, 6, 5, 1, 7, 1, NA, 4, 1, 1, 6, NA, 1, 1, 10, 1, NA, 6, NA, 8, NA, 4, 2, 1, 1, 10, NA, 1, 1, NA, 2, 4, 2, NA, NA, 6, 3, 1, 8, NA, NA, 5, 1, 1, 1, 1, 8, NA, 2, NA, 10, NA, 3, 1, 3, 10, 1, 1, 1, 7, 3, 4, 10, 10, 1, 2, 1, 1, 10, 10, NA, 1, 1, 1, 1, 2, 1, 1, NA, 1, 4, 1, 1, 6, 2, 1, 1, 2, 3, 2, 1, NA, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, NA, 1, 1, NA, 2, 1, 1, 1, 6, NA, 2, 6, 5, 1, 2, 1, NA, 7, 1, NA, 1, 10, 1, 2, 1, 1, 1, 3, 1, 3, 5, 1, NA, 1, 3, 1, 10, 4, 3, 1, 6, 1, 1, 1, 1, 8, 3, 1, NA, NA, 1, 1, 8, 6, 1, 1, 3, 1, 3, 1, 1, 7, 2, 5, 1, 1, 1, 3, NA, NA, 1, 1, 1, 1, 1, 10, 1, 4, NA, 5, 1, 3, 1, 10, 10, 1, 1, 4, 1, 10, 10, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 10, NA, 1, 1, 1, 1, 1, 1, NA, 7, NA, NA, 2, 3, 1, 1, 4, 10, NA, NA, NA, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 10, 1, NA, 7, 1, 1, NA, 1, 1, 1, 2, 3, 1, NA, 1, 1, 1, 1, 1, NA, 1, 1, 1, 10, 4, 5, 1, 1, 3, 1, 1, 1, NA, 1, 1, 6, 5, 1, 6, 1, 10, 1, 9, NA, 5, 6, 5, 1, NA, 1, 1, 1, NA, NA, NA, 1, 1, 8, 8, 3, 2, 1, 10, 1, 10, 10, NA, 1, 1, 1, 1, 1, 1, 1, NA, 3, 3, 1, 2, NA, NA, 1, 1, 1, 1, NA, 1, 4, 1, 1, 8, NA, NA, 3, 3, 1, 1, 1, 1, NA, 1, 3, 10, 1, 2, 3, NA, 1, 1, 1, 1, 1, 7, 1, NA, 1, 1, NA, NA, 1, 2, 1, 1, 8, NA, 2, NA, 1, 1, 1, 1, 1, 1, 1, 10, NA, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 1, 1, NA, 1, 3, 4, 5 ))
v1 <- c( NA, 7, 2, NA, 2, 7, 2, 2, 2, 2, 1, 2, NA, 2, 7, 6, 2, 2, 4, 2, 5, 6, 2, 2, 2, NA, 1, NA, 2, 2, NA, 2, 8, 2, 2, 2, 6, NA, 2, NA, NA, 3, 8, 10, 8, 2, 4, 2, 2, 4, 2, 2, 3, NA, 8, 4, 3, NA, 6, NA, 3, 2, 10, 5, 2, NA, 2, 8, NA, 2, 2, 10, 2, 6, 3, 2, 2, 2, 2, NA, 1, 2, 2, 2, NA, 10, 5, 5, 2, 3, 2, 2, 2, NA, 2, 2, 2, NA, 10, 5, 10, 2, NA, 6, 10, NA, 2, NA, 2, 3, 2, 5, 2, 10, 2, 2, 2, 4, NA, NA, 2, 2, 10, 8, 9, NA, 4, 2, 5, 10, 2, 2, 8, 2, 3, NA, 2, 2, NA, 1, NA, 2, NA, 2, 2, 2, 6, 3, 8, 10, 1, 6, NA, 2, 2, NA, 2, 2, 3, 6, 5, 2, 2, 1, 2, NA, 8, NA, 2, 1, 2, NA, 2, 8, NA, 10, 2, 8, NA, NA, 1, 2, 2, NA, NA, 1, 5, 6, 5, 2, 6, 10, 2, 2, 2, 2, NA, 2, 2, NA, 5, 10, 2, 2, 2, 6, 7, 1, 1, 1, 5, NA, 2, 7, 3, 5, 2, NA, 6, 2 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, NA, NA, 1, 3, 3, 2, NA, 3, 1, 10, 3, 6, 3, 5, 3, 2, 8, 6, 6, 3, 2, 1, 2, 2, 2, 2, 5, 2, 2, 2, 2, NA, 3, 8, 10, 4, 2, 2, 2, 5, NA, 10, 5, NA, 10, 2, NA, 3, 3, NA, 3, 2, NA, 3, 2, 2, NA, 2, 2, 3, 2, 2, 4, 2, 2, 8, 10, 3, 4, 6, 2, 2, 2, NA, 2, 6, 4, 2, 5, 4, NA, 2, 9, 2, 3, 4, 2, 2, 4, 2, 3, 2, 10, 2, 1, NA, 5, 6, 5, 6, 5, 2, 2, NA, 2, 2, 2, 2, 6, 5, 2, 2, 2, 4, 3, 2, 4, 2, 1, 2, 2, 2, 2, 2, 10, 2, 3, 1, 5, 3, 2, 2, 7, NA, 2, 3, 3, NA, 8, 3, 10, 6, 4, 2, 2, 2, 8, 5, 2, 1, 3, 2, 2, 2, 2, 2, 2, 1, 3, 4, 2, 4, NA, 2, 2, 3, 2, 3, 2, NA, 2, 3, 2, NA, NA, 2, NA, 2, 2, NA, NA, 2, 2, 2, 2, 2, 2, 2, NA, NA, 2, 1, 8, 2, 3, 3, 10, 2, 2, 5, 2, 10, 2, 2, 2, 10, NA, 4, NA, 2, 2, 4, 2, 2, 5, 3, NA, 2, 2, 2, 4, 2, 2, 2, 3, NA, 2, 2, 1, 6, 1, 2, NA, 6, 3, 2, 5, 6, 2, 2, 2, NA, NA, 2, 2, 6, 4, NA, 2, 2, 1, 2, 1, 2, 2, NA, 2, 2, 2, 4, 1, 2, 10, NA, 2, 1, 1, 6, 3, 3, 2, 3, 1, 6, 4, 1, 1, 2, 2, NA, 2, 2, 2, 2, 2, 2, 7, 2, 2, NA, 2, 2, 2, NA, 3, 3, NA, 1, 2, 4, 3, 3, NA, 4, 2, 2, NA, NA, NA, 1, NA, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, 2, 2, 2, 4, 1, 1, 4, 2, NA, 2, 2, 2, 2, NA, 2, 2, 2, NA, 2, 2, 2, 2, 5, 2, 2, 6, 6, 8, 3, 2, 2, NA, 2, 2, 2, NA, 2, 2, NA, 4, NA, 3, 2, 6, 2, 6, 2, 4, 4, 3, 2, 4, 2, 2, 2, 2, 1, NA, 1, 2, 4, 5, 5, 2, 2, 10, 2, 3, 5, NA, 2, 1, 2, 2, NA, 2, 2, 2, 3, 2, NA, 1, 3, 7, NA, 2, 2, 2, 2, 2, 5, 2, 2, 7, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 10, 2, NA, NA, 2, 2, 2, 2, NA, 8, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 5, 5, 2, NA, 3, 2, 2, NA, 2, 2, 2, 5, 4, 2, NA, 2, NA, 2, 2, 2, 2, 2, 4, NA, 2, NA, 2, 7, 3, 4 ))
v1 <- c( 1, 10, NA, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, 1, 10, NA, 10, 7, 1, NA, 1, 7, 1, 1, 1, 1, 1, 1, NA, NA, 1, 1, 1, 1, 10, NA, NA, 3, NA, 1, 1, 1, 9, NA, 1, 8, 3, 4, 5, 8, 8, 5, 6, 1, 10, 2, 3, 2, 8, 2, 1, 2, 1, 10, 9, 1, 1, 2, 1, 10, NA, 2, 1, 1, 3, 1, 1, 1, 1, 2, 9, 4, 8, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 10, 5, 5, 1, 3, 1, 3, 10, 10, 1, 9, 2, 9, 10, 8, 3, 5, 2, NA, 3, NA, 1, 2, 10, 10, 7, 1, NA, NA, 10, NA, 1, 1, 10, 1, 1, 2, 1, 1, 1, NA, 1, 1, NA, 5, NA, NA, 8, 2, 1, 10, 1, 10, 5, 3, NA, 10, 1, 1, NA, 10, 10, 1, 1, 3, NA, 2, 10, 1, 1, 1, 1, 1, 1, 10, 10, 10, 1, 1, 1, NA, 1, 1, 1, 10, 10, 1, 8, NA, 8, NA, 8, 10, 1, NA, 1, 1, 7, 1, 1, 1, 10, 10, 1, 1, 1, 10, 5, 1, 1, 1, 10, 8, 1, 10, 10, 5, 1, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 4, NA, 1, 10, NA, 8, 10, 1, 10, 5, 1, NA, 7, 8, 1, NA, 1, NA, 10, 2, NA, 10, 2, NA, 1, 5, 1, NA, 10, 9, 1, NA, NA, 10, 10, 10, 8, 10, 1, 1, NA, 8, 10, 10, 10, 10, 3, 1, 10, 10, NA, NA, 10, 1, 10, 4, 1, NA, 1, 1, 1, 7, 1, 1, 10, NA, 10, 10, 10, 1, 5, 10, 1, 1, NA, NA, NA, 10, 5, NA, 1, NA, 4, 1, 10, 1, 10, 10, 1, 1, NA, NA, 1, 1, 1, 1, 1, NA, 10, 8, 1, 5, NA, NA, 1, 10, 1, 1, 10, 1, 4, NA, 8, 1, 1, 10, 10, 1, NA, 1, NA, 10, 10, NA, NA, 1, NA, 1, 1, 1, 1, 8, 1, 1, 3, 10, NA, 1, 3, 10, 4, 7, 10, 10, 3, 3, 1, 1, 10, NA, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, 1, 10, NA, 1, 1, 1, 10, 1, 1, 2, 1, 10, 1, 1, NA, NA, NA, 1, 1, 1, 9, 1, 1, 4, 1, 1, 1, NA, 2, 1, NA, NA, 4, NA, 10, 3, 10, 1, 2, 1, 3, 10, 1, NA, 1, 10, 1, 2, NA, 1, 1, 1, 1, 1, 8, 10, NA, 1, 1, 1, 10, 4, NA, 2, 1, 1, 1, 1, 1, 10, NA, 1, 1, 10, 1, 6, NA, NA, 1, 1, 1, NA, 1, 1, 1, 4, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 5, NA, 1, NA, 1, 10, 3, 4, 1, 10, 1, 10, 5, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 1, 5, 4, 1, 1, 1, 1, NA, NA, 10, 10, 1, 1, 1, 10, 1, 1, 5, 10, 1, 1, 1, NA, 1, 1, 10, 1, 1, 1, 1, NA, 1, 1, 1, NA, 2, 1, NA, 1, NA, 1, 10, 1, 1, NA, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 1, 3, 10, 5, 10, 10, 1, NA, NA, 1, 1, 1, 1, NA, NA, NA, 10, 1, NA, 1, 10, 1, 3, NA, 1, NA, 10, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 8, 1, 1, 10, 1, 10, 2, 10, 1, 1, 1, 1, NA, 1, 1, NA, 2, 1, 1, 1, 4, 6, 5, NA, 1, 1, 1, NA, 3, 1, 1, 1, 2, 1, 1, NA, 1, 1, NA, 1, 1, 1, 1, NA, 1, 4, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 1, NA, 1, NA, NA, 1, 1, 1, NA, 8, 1, 1, 1, 1, NA, 1, 1, 1, 1, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 1, 3, 4, 5 ))
v1 <- c( 3, 3, 3, 3, 3, NA, 3, 3, NA, 2, 3, 2, 4, 3, 5, NA, 2, 3, 4, 3, 5, 7, NA, 7, 3, 3, 2, NA, NA, 1, 2, 3, 7, 3, 2, 2, 8, 7, 5, 7, 7, 6, 7, 3, 8, NA, 4, 2, 3, NA, 2, 3, 4, 7, 7, 3, 3, 5, 5, NA, 4, NA, 3, 3, 2, 4, 3, 4, 8, 3, 2, 7, 7, 4, 3, 4, 2, 2, 3, 2, 7, 2, 3, 7, 7, 4, NA, 6, 3, 2, 3, 1, 3, NA, 3, NA, 1, NA, 2, 7, 3, 2, NA, 7, 8, 3, 4, 5, 2, 7, 5, 3, 7, NA, 3, 1, NA, NA, 1, NA, NA, 3, 5, 5, 8, 2, 7, NA, 1, 1, 2, 3, 3, 2, 2, 3, 2, 1, 2, 2, 1, 1, 4, 1, 2, 2, 4, 2, 5, 7, 3, NA, 8, NA, 1, NA, 2, 3, 1, NA, 5, 3, 3, 1, 3, 3, 3, 3, NA, 1, 1, 3, 2, 10, 6, 5, NA, 5, 3, 3, 3, 1, 3, 7, 5, 3, 7, 7, 9, 3, 7, 4, 2, 3, NA, 3, NA, 3, NA, 2, 7, 8, 3, NA, 3, NA, 3, 3, 3, 3, 8, 7, 3, 7, 10, NA, 2, 3, 8, 3, 3, 9, 2, NA, 7, 2, 8, 7, 3, 9, NA, 8, 4, 4, 3, NA, NA, 4, 3, 5, 2, 3, 3, 5, NA, 3, 7, NA, NA, 3, 1, 5, 3, 7, 3, 3, 1, 2, 3, 3, 5, NA, 7, 5, 5, 3, 4, 7, 8, 3, NA, 3, NA, 3, NA, 2, 2, NA, 3, 3, 3, NA, 5, 5, 3, NA, 4, 2, 5, 4, 1, 3, 6, 2, NA )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 7, 4, 2, 1, 7, 7, 3, 7, 3, 3, 3, 3, 3, 8, 5, 2, 1, 3, 1, NA, 4, 4, 8, 3, 7, 7, 3, NA, 4, 3, 2, 5, 2, 3, 7, 6, 3, NA, 4, NA, 1, 3, 3, 2, NA, NA, 3, 1, NA, NA, 1, 1, 1, 3, 7, 1, 3, 4, NA, NA, 2, 3, 7, 4, 3, 8, 5, NA, 2, 3, 2, NA, 8, 1, NA, 2, 1, 2, NA, 2, NA, 2, 2, 2, 3, 1, 7, NA, 1, 1, 1, 7, 3, 2, 2, 2, 7, 2, 1, 2, 2, NA, 1, 2, 1, 9, 1, 2, 1, 1, 2, 2, 2, 3, 2, 2, NA, 8, NA, 6, 3, 7, 2, 3, 1, 3, 8, 3, 2, NA, 10, NA, 5, 2, 2, 2, 3, 2, 1, 4, NA, 2, 1, 1, 1, 10, NA, 1, NA, 2, 1, 1, 1, 1, NA, 2, 1, 1, 10, 1, 1, 8, 10, 1, 1, 1, 1, 1, 1, 1, 7, 9, 7, 1, 2, 2, 1, NA, 1, 1, NA, 1, 1, 1, 7, 1, 1, 10, 9, NA, 1, 2, 8, 3, 4, 1, 7, 2, 6, 2, 2, 1, 1, 2, 2, NA, NA, 2, 3, 1, 1, NA, 1, 1, 1, NA, NA, 1, 2, 8, 9, 1, 2, 1, 9, 1, NA, 7, NA, 2, 1, NA, 3, 1, 2, 6, 2, 3, NA, 2, 3, 3, NA, 2, 2, 2, 1, 1, 2, 2, 2, 7, 1, 1, 7, 2, 3, 4, 2, 1, 4, NA, 1, NA, 2, 3, 3, 3, 2, 3, 10, NA, NA, 2, 10, 8, 9, NA, 2, 7, 3, NA, 2, 2, 3, 2, 7, 6, 1, 1, 1, 10, NA, 4, NA, 10, 7, 4, 1, 7, 2, 2, NA, NA, 1, 2, 2, 2, 8, NA, 7, 1, 1, 10, 1, 7, 8, 10, 2, 2, 2, NA, 1, 2, 2, 2, NA, 2, NA, 2, 1, 7, NA, 1, 1, 1, 2, 1, 5, NA, 1, 10, 2, 1, 1, NA, 2, 2, 1, 1, 2, 1, NA, 10, 2, 1, 2, 2, 2, NA, 2, NA, 3, 7, 1, 2, 3, NA, 2, 2, 1, 1, 3, 7, 7, 7, 3, 3, 1, 2, 1, 2, NA, 1, 1, 10, 5, 3, 1, 1, 1, 1, 2, 1, 1, NA, 4, 1, 2, 1, 1, 8, 10, 10 ))
v1 <- c( 1, NA, 1, 7, NA, 7, 1, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, 1, 1, 1, 4, 10, 1, 3, 1, 6, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 9, 1, 6, NA, 8, NA, 3, 1, 10, NA, 8, NA, 1, 8, 1, 4, 10, 3, 1, 6, NA, 4, 1, 1, 10, 1, 3, 9, 1, 3, 1, 9, 9, 2, NA, 8, 2, 8, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, NA, 10, 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 9, NA, 5, 1, 1, 8, 2, 1, NA, 3, 8, 3, 1, 3, 1, 1, NA, 2, 5, 1, 1, 1, 1, 3, 3, 10, NA, 5, 1, NA, NA, 1, 1, 6, 2, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, NA, 1, 1, 1, 8, NA, 1, 4, NA, 1, 1, 1, 1, 1, 1, 10, 7, 2, NA, 1, 1, 2, 10, 1, 1, 1, 1, 1, 1, NA, NA, 7, NA, 10, 1, 1, NA, 1, 1, 8, NA, NA, 10, 7, 10, NA, 10, 10, NA, 1, 1, 1, NA, 1, 1, NA, NA, 1, NA, NA, 1, 10, 5, NA, 1, NA, 10, 7, 1, 10, 6, 10, 1, 1, 10, 1, 1, 10, 1, NA, 9, 1, 9, 7, 1, 10, 6, 9, 3, 1, 6, 1, 8, 10, 10, NA, 3, 1, 1, 1, 1, 1, 8, 3, 6, 1, 1, 3, 5, 3 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 3, 6, 1, 1, 1, 4, 1, NA, 8, 5, 3, 2, 3, 1, 7, 1, 9, 1, 1, 3, 2, 1, NA, 1, NA, 3, 1, 1, 6, NA, 8, 7, 10, 1, 10, 10, 1, 1, 10, 3, NA, 4, 7, NA, 1, 7, 10, 1, NA, 1, 3, 10, 1, 1, 8, NA, NA, NA, 5, 1, 1, 9, 3, NA, 1, 3, 4, 1, 1, NA, 1, 3, 4, NA, NA, 1, 1, NA, NA, 3, 4, 1, 4, 1, 1, NA, 6, 1, NA, NA, NA, 1, 3, 3, 3, 6, 1, 1, 6, NA, 1, 2, 3, NA, 10, 5, 10, 1, NA, 1, 1, 1, NA, 10, 1, NA, NA, 1, 1, NA, NA, 1, 1, NA, 2, NA, 1, 8, 2, 1, 1, 2, 1, 1, 2, 2, 1, 9, 1, 1, 1, NA, 1, 1, NA, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, NA, 1, 1, NA, 1, 6, 5, 2, 1, 2, 1, NA, 2, 3, 1, 1, 10, 1, 10, 1, 1, 2, 2, 2, NA, NA, NA, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 10, 1, 1, NA, 7, 1, 1, 6, 10, 1, 1, 1, 1, 1, NA, 1, 10, 7, 6, 1, 1, 1, 1, 1, NA, 1, 1, NA, 1, 1, 5, 1, 1, 10, 10, 1, 1, 1, 1, 4, NA, 1, NA, NA, 5, 1, 1, NA, 1, NA, 1, 1, 1, 1, NA, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, NA, 10, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, NA, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 10, 1, 1, 8, 1, NA, 2, 1, NA, 8, 1, 1, 1, 1, 1, 1, NA, 1, 2, 10, 1, 1, 5, 3, NA, 10, 1, 1, 7, 1, 1, 1, 1, 1, 1, 5, 10, 1, 1, 1, 10, NA, 1, 1, 1, 6, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 10, 1, 8, 1, 1, NA, 1, 1, 5, 10, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 10, NA, 1, 10, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 10, 1, 1, NA, 2, 1, 1, 1, 1, 6, 2, 1, 1, 1, 1, 1, 1, 1, 1, NA, 10, 10, 4, 1, 1, 1, NA, 1, 1, NA, NA, 1, 10, NA, 2, 1, 1, 1, NA, 3, 1, 1, NA, 4, 1, 1, 1, 1, 10, NA, 4 ))
v1 <- c( 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, NA, 1, 1, 4, 1, 1, 1, NA, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 3, 1, 1, 1, NA, 1, 1, 1, 1, 2, 3, 1, 1, 2, NA, 1, 1, 2, 5, NA, 2, 7, NA, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 10, 1, 1, 8, 1, 1, 10, NA, 1, NA, 1, NA, NA, 1, 1, 1, NA, NA, NA, 7, 10, 1, NA, NA, 1, NA, NA, 1, 1, 1, 1, 1, NA, NA, 4, 2, 1, 1, 1, 8, 7, 1, 1, NA, 3, 2, 1, 3, 1, 1, 1, 1, 8, 1, 1, 1, 1, 3, 1, 1, NA, 5, 1, NA, 1, 1, 1, 3, NA, 1, 1, 1, 1, 1, NA, 1, 1, 3, 1, NA, 1, 1, 1, 1, NA, 1, 3, 1, 1, NA, 1, NA, 1, 1, 6, 2, 1, NA, NA, NA, 1, 3, NA, NA, 1, NA, 1, 1, 7, 1, 1, NA, 3, 1, 1, NA, NA, 1, 1, 1, 1, 1, 10, 1, 1, NA, 3, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1, 5, 1, 1, 1, 1, NA, 1, NA, 4, 1, 2, NA, 1, 2, 1, 1, 1, 1, 1 )
breast_cancer_x <- cbind(breast_cancer_x, c( v1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, NA, 1, 10, NA, NA, 2, 1, 1, NA, 1, 1, 1, 1, 1, 1, NA, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 3, 5, NA, 1, NA, 1, 2, 1, 8, 1, NA, 1, 1, NA, NA, 1, NA, 1, 1, 8, 1, 1, 1, 1, 2, 3, 10, 1, 1, 4, 1, 1, 1, 1, 1, 1, NA, 1, 1, 2, NA, 1, NA, 1, 1, NA, 1, NA, NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, NA, 1, 1, NA, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, NA, 1, NA, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, NA, 8, 3, 3, 10, NA, NA, 1, 1, 1, 7, 3, 1, 1, 1, 1, 1, 1, NA, 1, NA, 1, 1, 1, 1, 4, 1, 1, 1, 3, NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, NA, 1, 1, 1, NA, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, NA, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, NA, 10, 1, NA, 1, 1, 5, NA, 1, NA, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, 1, 1, 1, 1, NA, NA, 1, NA, 1, 1, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, 1, NA, 2, 1, 1, 1, 2, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, NA, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 2, 3, 1, NA, 1, 1, 2, 1, 10, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, 1, 1, 1, 3, 1, 1, NA, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 10, 1, 1, 1, 1, NA, 1, NA, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, 3, 1, 1, 1, NA, 1, 1, 1, 8, 1, 1, 1, 2, 1, 1, 2, NA, 1 ))
# Label the nine Wisconsin breast-cancer predictors. The columns were built
# above from raw vectors with NAs deliberately injected, so the imputation
# exercises below have missing values to fill.
names(breast_cancer_x) <- c( 'Cl.thickness', 'Cell.size', 'Cell.shape', 'Marg.adhesion',
'Epith.c.size', 'Bare.nuclei', 'Bl.cromatin', 'Normal.nucleoli', 'Mitoses'
)
str(breast_cancer_x)
## 'data.frame': 699 obs. of 9 variables:
## $ Cl.thickness : num 5 NA NA 6 4 8 1 2 NA NA ...
## $ Cell.size : num NA 4 NA 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 NA 2 1 1 ...
## $ Marg.adhesion : num 1 NA 1 NA 3 8 NA 1 NA 1 ...
## $ Epith.c.size : num NA 7 2 NA 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 NA 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 NA 3 3 NA 2 ...
## $ Normal.nucleoli: num 1 NA 1 7 NA 7 1 1 1 NA ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
v1 <- c( 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1 )
breast_cancer_y <- ifelse(c( v1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2 ) == 2, "malignant", "benign")
# Encode the outcome as a two-level factor with "benign" as the first level.
breast_cancer_y <- factor(breast_cancer_y, levels=c("benign", "malignant"))
str(breast_cancer_y)
## Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Create custom trainControl: myControl
# 10-fold cross-validation reporting ROC / Sens / Spec via twoClassSummary.
myControl <- caret::trainControl(
  method = "cv", number = 10,
  # Qualified so this resolves even though caret is only ever used via caret::
  # in this script and never attached with library(caret).
  summaryFunction = caret::twoClassSummary,
  classProbs = TRUE, # IMPORTANT! twoClassSummary needs class probabilities
  verboseIter = TRUE
)
# Apply median imputation: model
# preProcess = "medianImpute" fills each predictor's NAs with that column's
# median (caret applies preprocessing inside each resampling iteration).
model <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "medianImpute"
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model to console
model
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: median imputation (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 630, 630, 628, 629, 629, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9923897 0.9695169 0.9421667
# Apply KNN imputation: model2
# knnImpute also triggers centering and scaling — see the "Pre-processing"
# line in the printed model output below.
model2 <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "knnImpute"
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model to console
model2
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: nearest neighbor imputation (9), centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 628, 630, 629, 629, 630, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9925008 0.9716908 0.9291667
# Fit glm with median imputation: model1
# Baseline for the median-impute + standardization comparison below.
model1 <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = "medianImpute"
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model1
model1
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: median imputation (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 629, 630, 629, 630, 628, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9913635 0.9694203 0.9461667
# Fit glm with median imputation and standardization: model2
# caret applies the preprocessing steps in its own fixed internal order,
# regardless of the order given here.
model2 <- caret::train(
x = breast_cancer_x, y = breast_cancer_y,
method = "glm",
trControl = myControl,
preProcess = c("medianImpute", "center", "scale")
)
## Warning in train.default(x = breast_cancer_x, y = breast_cancer_y, method =
## "glm", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
# Print model2
model2
## Generalized Linear Model
##
## 699 samples
## 9 predictor
## 2 classes: 'benign', 'malignant'
##
## Pre-processing: median imputation (9), centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 629, 628, 630, 629, 629, 629, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9917448 0.9694203 0.9418333
# Regression example: the BloodBrain data set ships with caret.
data(BloodBrain, package="caret") # produces logBBB (y) and bbbDescr (x)
bloodbrain_y <- logBBB
keyNames <- c( 'tpsa', 'nbasic', 'vsa_hyd', 'a_aro', 'weight', 'peoe_vsa.0', 'peoe_vsa.1', 'peoe_vsa.2', 'peoe_vsa.3', 'peoe_vsa.4', 'peoe_vsa.5', 'peoe_vsa.6', 'peoe_vsa.0.1', 'peoe_vsa.1.1', 'peoe_vsa.2.1', 'peoe_vsa.3.1', 'peoe_vsa.4.1', 'peoe_vsa.5.1', 'peoe_vsa.6.1', 'a_acc', 'a_acid', 'a_base', 'vsa_acc', 'vsa_acid', 'vsa_base', 'vsa_don', 'vsa_other', 'vsa_pol', 'slogp_vsa0', 'slogp_vsa1', 'slogp_vsa2', 'slogp_vsa3', 'slogp_vsa4', 'slogp_vsa5', 'slogp_vsa6', 'slogp_vsa7', 'slogp_vsa8', 'slogp_vsa9', 'smr_vsa0', 'smr_vsa1', 'smr_vsa2', 'smr_vsa3', 'smr_vsa4', 'smr_vsa5', 'smr_vsa6', 'smr_vsa7', 'tpsa.1', 'logp.o.w.', 'frac.anion7.', 'frac.cation7.', 'andrewbind', 'rotatablebonds', 'mlogp', 'clogp', 'mw', 'nocount', 'hbdnr', 'rule.of.5violations', 'prx', 'ub', 'pol', 'inthb', 'adistm', 'adistd', 'polar_area', 'nonpolar_area', 'psa_npsa', 'tcsa', 'tcpa', 'tcnp', 'ovality', 'surface_area', 'volume', 'most_negative_charge', 'most_positive_charge', 'sum_absolute_charge', 'dipole_moment', 'homo', 'lumo', 'hardness', 'ppsa1', 'ppsa2', 'ppsa3', 'pnsa1', 'pnsa2', 'pnsa3', 'fpsa1', 'fpsa2', 'fpsa3', 'fnsa1', 'fnsa2', 'fnsa3', 'wpsa1', 'wpsa2', 'wpsa3', 'wnsa1', 'wnsa2', 'wnsa3', 'dpsa1', 'dpsa2', 'dpsa3', 'rpcg', 'rncg', 'wpcs', 'wncs', 'sadh1', 'sadh2', 'sadh3', 'chdh1', 'chdh2', 'chdh3', 'scdh1', 'scdh2', 'scdh3', 'saaa1', 'saaa2', 'saaa3', 'chaa1', 'chaa2', 'chaa3', 'scaa1', 'scaa2', 'scaa3', 'ctdh', 'ctaa', 'mchg', 'achg', 'rdta', 'n_sp2', 'n_sp3', 'o_sp2', 'o_sp3' )
# Keep the 132 molecular descriptors listed in keyNames (208 compounds).
bloodbrain_x <- bbbDescr[, keyNames]
dim(bloodbrain_x)
## [1] 208 132
# Identify near zero variance predictors: remove_cols
# freqCut = 2 / uniqueCut = 20 are much looser than caret's defaults
# (95/5 and 10 per the nearZeroVar docs), so more columns get flagged.
remove_cols <- caret::nearZeroVar(bloodbrain_x, names = TRUE,
freqCut = 2, uniqueCut = 20)
# Record the full predictor name set: all_cols
all_cols <- colnames(bloodbrain_x)
# Keep only the columns that survived the near-zero-variance screen:
# bloodbrain_x_small
keep_cols <- setdiff(all_cols, remove_cols)
bloodbrain_x_small <- bloodbrain_x[, keep_cols]
# Fit model on reduced data: model
# No trControl supplied, so train() falls back to its default of 25
# bootstrap resamples (see the printed model output below).
model <- caret::train(x = bloodbrain_x_small, y = bloodbrain_y, method = "glm")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Print model to console
model
## Generalized Linear Model
##
## 208 samples
## 112 predictors
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
## Resampling results:
##
## RMSE Rsquared
## 1.782164 0.1089338
# Fit glm model using PCA: model
# PCA preprocessing (which implies centering and scaling) collapses the 132
# correlated descriptors; note the large RMSE improvement versus the
# rank-deficient fit above.
model <- caret::train(
x = bloodbrain_x, y = bloodbrain_y,
method = "glm", preProcess = "pca"
)
# Print model to console
model
## Generalized Linear Model
##
## 208 samples
## 132 predictors
##
## Pre-processing: principal component signal extraction (132),
## centered (132), scaled (132)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
## Resampling results:
##
## RMSE Rsquared
## 0.6397498 0.4168059
Chapter 5 - Selecting Models Case Study (Customer Churn)
Reusing a trainControl - to compare apples to apples, make sure that all the models use the same training/test splits:
Reintroduce glmnet - linear model with built-in variable selection:
Reintroduce random forest - often the second model to try on a new predictive model:
Comparing models - assess the quality of the predictions (apples to apples is gained by using the same train/test splits on the data):
More on resamples - many great methods and inspired the caretEnsembles package:
Example code includes:
# Customer-churn case study: churnTrain ships with the C50 package and has
# no missing values.
data(churn, package="C50")
sum(is.na(churnTrain)) # 0
## [1] 0
dim(churnTrain) # 3333 x 20
## [1] 3333 20
# Rows to sample per state; parallel to keyStateNames below.
keyStateNums <- c( 5, 4, 8, 3, 4, 3, 4, 3, 5, 4, 4, 3, 3, 9, 4, 1, 9, 8, 4, 3, 4, 5, 4, 6, 6,
1, 4, 3, 3, 8, 4, 7, 5, 5, 6, 7, 4, 6, 5, 6, 9, 3, 5, 4, 6, 11, 2, 4, 2, 9, 5
)
# Two-letter state codes (plus DC), in alphabetical order.
keyStateNames <- c( 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA',
'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS',
'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'
)
# Stratified sample of row indices: keyStateNums[i] rows drawn from state
# keyStateNames[i]. Iterating by position avoids the per-iteration match()
# lookup the original used.
keyIdx <- integer(0)
for (i in seq_along(keyStateNames)) {
  # Candidate row numbers for this state (row names of churnTrain are "1".."3333").
  pool <- as.integer(row.names(churnTrain[churnTrain$state == keyStateNames[i], ]))
  # sample.int on the pool length sidesteps sample()'s length-1 surprise
  # (sample(x) with a single number x draws from 1:x). For pools of length
  # > 1 this consumes the same RNG stream sample() would, so seeded results
  # are unchanged.
  picked <- pool[sample.int(length(pool), size = keyStateNums[i], replace = FALSE)]
  keyIdx <- c(keyIdx, sort(picked))
}
# Hand-rolled 0/1 dummy columns for the factor predictors, named in the
# model.matrix()-style "varlevel" convention — presumably to match the
# column names the DataCamp exercise expects (TODO confirm).
churn_x <- churnTrain[keyIdx, ] %>%
mutate(international_planyes=as.integer(international_plan=="yes"),
area_codearea_code_415=as.integer(area_code=="area_code_415"),
area_codearea_code_510=as.integer(area_code=="area_code_510"),
voice_mail_planyes=as.integer(voice_mail_plan=="yes")
) %>%
select(-c(state, churn, area_code, international_plan, voice_mail_plan))
# Outcome as a two-level factor, levels ordered no/yes.
churn_y <- factor(churnTrain[keyIdx, "churn"], levels=c("no", "yes"))
# One-hot state indicators: keyIdx stacked rows state-by-state in
# keyStateNames order, keyStateNums[i] rows apiece, so column i is 1L on
# exactly its contiguous run of rows and 0L elsewhere.
stateCols <- matrix(data=0L, nrow=sum(keyStateNums), ncol=length(keyStateNums))
# rep() expands each column index over its run length; the two-column matrix
# index then sets every 1 in a single vectorized assignment, replacing the
# original curRow/curCol counter loop with identical results.
stateCols[cbind(seq_len(sum(keyStateNums)),
                rep(seq_along(keyStateNums), keyStateNums))] <- 1L
stateDF <- as.data.frame(stateCols)
names(stateDF) <- paste0("state", keyStateNames)
churn_x <- cbind(churn_x, stateDF)
# Create custom indices: myFolds
# Fixing the folds up front means every model below trains on identical
# splits, making the later resamples() comparison apples-to-apples.
myFolds <- caret::createFolds(churn_y, k = 5)
# Create reusable trainControl object: myControl
myControl <- caret::trainControl(
  # Qualified so this resolves even though caret is only ever used via
  # caret:: in this script and never attached with library(caret).
  summaryFunction = caret::twoClassSummary,
  classProbs = TRUE, # IMPORTANT! twoClassSummary needs class probabilities
  verboseIter = TRUE,
  savePredictions = TRUE,
  index = myFolds
)
# Fit glmnet model: model_glmnet
# Penalized logistic regression with built-in variable selection; tuned
# over caret's default alpha/lambda grid (see the fold log below).
model_glmnet <- caret::train(
x = churn_x, y = churn_y,
metric = "ROC",
method = "glmnet",
trControl = myControl
)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-10
## + Fold1: alpha=0.10, lambda=0.02283
## - Fold1: alpha=0.10, lambda=0.02283
## + Fold1: alpha=0.55, lambda=0.02283
## - Fold1: alpha=0.55, lambda=0.02283
## + Fold1: alpha=1.00, lambda=0.02283
## - Fold1: alpha=1.00, lambda=0.02283
## + Fold2: alpha=0.10, lambda=0.02283
## - Fold2: alpha=0.10, lambda=0.02283
## + Fold2: alpha=0.55, lambda=0.02283
## - Fold2: alpha=0.55, lambda=0.02283
## + Fold2: alpha=1.00, lambda=0.02283
## - Fold2: alpha=1.00, lambda=0.02283
## + Fold3: alpha=0.10, lambda=0.02283
## - Fold3: alpha=0.10, lambda=0.02283
## + Fold3: alpha=0.55, lambda=0.02283
## - Fold3: alpha=0.55, lambda=0.02283
## + Fold3: alpha=1.00, lambda=0.02283
## - Fold3: alpha=1.00, lambda=0.02283
## + Fold4: alpha=0.10, lambda=0.02283
## - Fold4: alpha=0.10, lambda=0.02283
## + Fold4: alpha=0.55, lambda=0.02283
## - Fold4: alpha=0.55, lambda=0.02283
## + Fold4: alpha=1.00, lambda=0.02283
## - Fold4: alpha=1.00, lambda=0.02283
## + Fold5: alpha=0.10, lambda=0.02283
## - Fold5: alpha=0.10, lambda=0.02283
## + Fold5: alpha=0.55, lambda=0.02283
## - Fold5: alpha=0.55, lambda=0.02283
## + Fold5: alpha=1.00, lambda=0.02283
## - Fold5: alpha=1.00, lambda=0.02283
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.0228 on full training set
# Fit random forest: model_rf
# "ranger" is a fast random-forest backend; caret tunes mtry (see fold log).
# Same myControl, so the folds match model_glmnet's exactly.
model_rf <- caret::train(
x = churn_x, y = churn_y,
metric = "ROC",
method = "ranger",
trControl = myControl
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## + Fold1: mtry=36
## - Fold1: mtry=36
## + Fold1: mtry=70
## - Fold1: mtry=70
## + Fold2: mtry= 2
## - Fold2: mtry= 2
## + Fold2: mtry=36
## - Fold2: mtry=36
## + Fold2: mtry=70
## - Fold2: mtry=70
## + Fold3: mtry= 2
## - Fold3: mtry= 2
## + Fold3: mtry=36
## - Fold3: mtry=36
## + Fold3: mtry=70
## - Fold3: mtry=70
## + Fold4: mtry= 2
## - Fold4: mtry= 2
## + Fold4: mtry=36
## - Fold4: mtry=36
## + Fold4: mtry=70
## - Fold4: mtry=70
## + Fold5: mtry= 2
## - Fold5: mtry= 2
## + Fold5: mtry=36
## - Fold5: mtry=36
## + Fold5: mtry=70
## - Fold5: mtry=70
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 36 on full training set
# Create model_list
model_list <- list(item1 = model_glmnet, item2 = model_rf)
# Pass model_list to resamples(): resamples
# Because both models were trained on the same myFolds, their per-fold
# metrics are directly comparable.
resamps <- caret::resamples(model_list)
# Summarize the results
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: item1, item2
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## item1 0.5436 0.5901 0.5959 0.6107 0.6133 0.7107 0
## item2 0.6649 0.6705 0.6824 0.6884 0.7113 0.7127 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## item1 0.9264 0.9451 0.9509 0.9534 0.9571 0.9877 0
## item2 0.9387 0.9634 0.9693 0.9681 0.9816 0.9877 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## item1 0.08108 0.08108 0.1081 0.1417 0.2162 0.2222 0
## item2 0.13510 0.18920 0.2222 0.2066 0.2432 0.2432 0
# Create bwplot
# bwplot/xyplot here are lattice-style methods caret provides for
# resamples objects.
bwplot(resamps, metric="ROC")
# Create xyplot
xyplot(resamps, metric="ROC")
# Create ensemble model: stack
# Crashes out on my machine; not sure why . . .
# stack <- caretEnsemble::caretStack(model_list, method="glm")
# Look at summary
# summary(stack)
Chapter 1 - Jumping in
What is text mining? The process of distilling actionable insights from text:
Getting started - “bag of words” does not care about word types, so verbs and conjunctions and the like are treated the same as nouns:
Cleaning and pre-processing text - common pre-processing functions:
TDM (term-document matrix) and DTM (document-term matrix):
Example code includes:
library(qdap) # Will require R 3.3.1 or higher for dependency "slam"
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:dplyr':
##
## explain
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:dplyr':
##
## id
##
## Attaching package: 'qdap'
## The following object is masked from 'package:Matrix':
##
## %&%
## The following object is masked from 'package:purrr':
##
## %>%
## The following object is masked from 'package:dplyr':
##
## %>%
## The following object is masked from 'package:base':
##
## Filter
new_text <- "DataCamp is the first online learning platform that focuses on building the best learning experience specifically for Data Science. We have offices in Boston and Belgium and to date, we trained over 250,000 (aspiring) data scientists in over 150 countries. These data science enthusiasts completed more than 9 million exercises. You can take free beginner courses, or subscribe for $25/month to get access to all premium courses."
# Print new_text to the console
new_text
## [1] "DataCamp is the first online learning platform that focuses on building the best learning experience specifically for Data Science. We have offices in Boston and Belgium and to date, we trained over 250,000 (aspiring) data scientists in over 150 countries. These data science enthusiasts completed more than 9 million exercises. You can take free beginner courses, or subscribe for $25/month to get access to all premium courses."
# Find the 10 most frequent terms: term_count
term_count <- qdap::freq_terms(new_text, 10)
# Plot term_count
plot(term_count)
# Import text data
rawTweets <- read.csv("BagOfWordsTweetData_v001.csv", stringsAsFactors=FALSE)
str(rawTweets)
## 'data.frame': 1000 obs. of 2 variables:
## $ Coffee : chr " @ayyytylerb that is so true drink lots of coffee" " RT @bryzy_brib: Senior March tmw morning at 7:25 A.M. in the SENIOR lot. Get up early, make yo coffee/breakfast, cus this will"| __truncated__ " If you believe in #gunsense tomorrow would be a very good day to have your coffee any place BUT @Starbucks Guns+Coffee=#nosens"| __truncated__ " My cute coffee mug. http://t.co/2udvMU6XIG" ...
## $ Chardonnay: chr " RT @oceanclub: @eilisohanlon @stonyjim @vonprond Eilis, I'm from Pearse St and even I can tell a Chardonnay from so?" " ?@roystbaggage: 'Go to your Auntie Chardonnay and she will help you piss up against that wall' - the scum of Dover.?what's thi"| __truncated__ " Big thank you to Ian at Fowles wine for making me a Chardonnay drinker. @LadiesWhoShoot #wrongwayround http://t.co/KiA2StsOEO" " RT @oceanclub: @eilisohanlon @stonyjim @vonprond Eilis, I'm from Pearse St and even I can tell a Chardonnay from so?" ...
# Isolate coffee text from tweets: coffee_tweets
coffee_tweets <- rawTweets$Coffee
# Load tm
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:qdap':
##
## ngrams
## The following object is masked from 'package:ggplot2':
##
## annotate
##
## Attaching package: 'tm'
## The following objects are masked from 'package:qdap':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
# Make a vector source: coffee_source
coffee_source <- VectorSource(coffee_tweets)
# Make a volatile corpus: coffee_corpus
coffee_corpus <- VCorpus(coffee_source)
# Print out coffee_corpus
coffee_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1000
# Print data on the 15th tweet in coffee_corpus
coffee_corpus[[15]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 112
# Print the content of the 15th tweet in coffee_corpus
coffee_corpus[[15]]$content
## [1] " @HeatherWhaley I was about 2 joke it takes 2 hands to hold hot coffee...then I read headline! #Don'tDrinkNShoot"
example_text <- data.frame(num=1:3, Author1=c('Text mining is a great time.', 'Text analysis provides insights', 'qdap and tm are used in text mining'), Author2=c('R is a great language', 'R has many uses', 'DataCamp is cool!'), stringsAsFactors=FALSE)
# Print example_text to the console
example_text
## num Author1 Author2
## 1 1 Text mining is a great time. R is a great language
## 2 2 Text analysis provides insights R has many uses
## 3 3 qdap and tm are used in text mining DataCamp is cool!
# Create a DataframeSource on columns 2 and 3: df_source
df_source <- tm::DataframeSource(example_text[,-1])
# Convert df_source to a corpus: df_corpus
df_corpus <- tm::VCorpus(df_source)
# Examine df_corpus
df_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
# Create a VectorSource on column 3: vec_source
vec_source <- tm::VectorSource(example_text[, 3])
# Convert vec_source to a corpus: vec_corpus
vec_corpus <- tm::VCorpus(vec_source)
# Examine vec_corpus
vec_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
# Create the object: text
text <- "<b>She</b> woke up at 6 A.M. It\'s so early! She was only 10% awake and began drinking coffee in front of her computer."
# All lowercase
tolower(text)
## [1] "<b>she</b> woke up at 6 a.m. it's so early! she was only 10% awake and began drinking coffee in front of her computer."
# Remove punctuation
tm::removePunctuation(text)
## [1] "bSheb woke up at 6 AM Its so early She was only 10 awake and began drinking coffee in front of her computer"
# Remove numbers
tm::removeNumbers(text)
## [1] "<b>She</b> woke up at A.M. It's so early! She was only % awake and began drinking coffee in front of her computer."
# Remove whitespace
tm::stripWhitespace(text)
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Remove text within brackets
bracketX(text)
## [1] "She woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace numbers with words
replace_number(text)
## [1] "<b>She</b> woke up at six A.M. It's so early! She was only ten% awake and began drinking coffee in front of her computer."
# Replace abbreviations
replace_abbreviation(text)
## [1] "<b>She</b> woke up at 6 AM It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace contractions
replace_contraction(text)
## [1] "<b>She</b> woke up at 6 A.M. it is so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace symbols with words
replace_symbol(text)
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10 percent awake and began drinking coffee in front of her computer."
# List standard English stop words
tm::stopwords("en")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
# Print text without standard stop words
tm::removeWords(text, tm::stopwords("en"))
## [1] "<b>She</b> woke 6 A.M. It's early! She 10% awake began drinking coffee front computer."
# Add "coffee" and "bean" to the list: new_stops
new_stops <- c("coffee", "bean", tm::stopwords("en"))
# Remove stop words from text
tm::removeWords(text, new_stops)
## [1] "<b>She</b> woke 6 A.M. It's early! She 10% awake began drinking front computer."
# Create complicate
complicate <- c("complicated", "complication", "complicatedly")
# Perform word stemming: stem_doc
stem_doc <- tm::stemDocument(complicate)
# Create the completion dictionary: comp_dict
comp_dict <- "complicate"
# Perform stem completion: complete_text
complete_text <- tm::stemCompletion(stem_doc, comp_dict)
# Print complete_text
complete_text
## complic complic complic
## "complicate" "complicate" "complicate"
# NEED FULL DICTIONARIES FOR THESE
# Remove punctuation: rm_punc
# rm_punc <- tm::removePunctuation(text_doc)
# Create character vector: n_char_vec
# n_char_vec <- unlist(strsplit(rm_punc, split = ' '))
# Perform word stemming: stem_doc
# stem_doc <- tm::stemDocument(n_char_vec)
# Print stem_doc
# stem_doc
# Re-complete stemmed document: complete_doc
# complete_doc <- tm::stemCompletion(stem_doc, comp_dict)
# Print complete_doc
# complete_doc
### DO NOT HAVE THE TWEET_CORP FILE (probably the coffee tweets corpus mentioned above)
# Alter the function code to match the instructions
# Clean a tm corpus for the coffee tweets: collapse whitespace, strip
# punctuation, lower-case everything, then remove English stop words plus
# the topic-specific terms "coffee" and "mug".
# NOTE(review): transformation order matters — removeWords runs last so it
# sees the lower-cased, punctuation-free text.
clean_corpus <- function(corpus){
  cleaned <- tm_map(corpus, stripWhitespace)
  cleaned <- tm_map(cleaned, removePunctuation)
  cleaned <- tm_map(cleaned, content_transformer(tolower))
  custom_stops <- c(stopwords("en"), "coffee", "mug")
  tm_map(cleaned, removeWords, custom_stops)
}
# Apply your customized function to the tweet_corp: clean_corp
# Applied to coffee_corpus instead
clean_corp <- clean_corpus(coffee_corpus)
# Print out a cleaned up tweet
clean_corp[[227]][1]
## $content
## [1] " also dogs arent smart enough dip donut eat part thats dipped ladyandthetramp"
# Print out the same tweet in original form
coffee_tweets[227]
## [1] " Also, dogs aren't smart enough to dip the donut in the coffee and then eat the part that's been dipped. #ladyandthetramp"
# Create the dtm from the corpus: coffee_dtm
coffee_dtm <- DocumentTermMatrix(clean_corp)
# Print out coffee_dtm data
coffee_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 3075)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_dtm to a matrix: coffee_m
coffee_m <- as.matrix(coffee_dtm)
# Print the dimensions of coffee_m
dim(coffee_m)
## [1] 1000 3075
# Review a portion of the matrix
coffee_m[148:150, 2587:2590]
## Terms
## Docs stampedeblue stand star starbucks
## 148 0 0 0 0
## 149 0 0 0 0
## 150 0 0 0 0
# Create a TDM from clean_corp: coffee_tdm
coffee_tdm <- TermDocumentMatrix(clean_corp)
# Print coffee_tdm data
coffee_tdm
## <<TermDocumentMatrix (terms: 3075, documents: 1000)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_tdm to a matrix: coffee_m
coffee_m <- as.matrix(coffee_tdm)
# Print the dimensions of the matrix
dim(coffee_m)
## [1] 3075 1000
# Review a portion of the matrix
coffee_m[2587:2590, 148:150]
## Docs
## Terms 148 149 150
## stampedeblue 0 0 0
## stand 0 0 0
## star 0 0 0
## starbucks 0 0 0
Chapter 2 - Word Clouds and Visuals
Common text mining visuals - good visualizations help with making quick conclusions:
Introduction to word clouds - a more popular form of word plot, with size typically defaulting to frequency:
Other word clouds and word networks:
Example code includes:
# Create a matrix: coffee_m
coffee_m <- as.matrix(coffee_tdm)
# Calculate the rowSums: term_frequency
term_frequency <- rowSums(coffee_m)
# Sort term_frequency in descending order
term_frequency <- sort(term_frequency, decreasing=TRUE)
# View the top 10 most common words
term_frequency[1:10]
## like cup shop just get morning want drinking
## 111 103 69 66 62 57 49 47
## can looks
## 45 45
# Plot a barchart of the 10 most common words
barplot(term_frequency[1:10], las=2, col="tan")
# Create frequency
frequency <- qdap::freq_terms(coffee_tweets, top=10, at.least=3, stopwords="Top200Words")
# Make a frequency barchart
plot(frequency)
# Create frequency2
frequency2 <- qdap::freq_terms(coffee_tweets, top=10, at.least=3, stopwords=tm::stopwords("english"))
# Make a frequency2 barchart
plot(frequency2)
# Creating a smaller version of the second term_frequency file (only words with 5+ appearances)
term_frequency <- c( 824, 104, 83, 76, 75, 63, 52, 47, 43, 35, 34, 32, 32, 25, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 )
names(term_frequency) <- c( 'chardonnay', 'marvin', 'wine', 'gaye', 'just', 'glass', 'like', 'bottle', 'lol', 'little', 'rose', 'dont', 'get', 'now', 'ass', 'can', 'know', 'love', 'drink', 'good', 'will', 'girl', 'night', 'time', 'cabernet', 'chocolate', 'still', 'thats', 'think', 'unoaked', 'well', 'milkshake', 'see', 'big', 'double', 'fists', 'inspired', 'jinkx', 'jinkxmonsoon', 'polite', 'really', 'better', 'dinner', 'got', 'httptcodudylkw', 'charles', 'fine', 'full', 'mood', 'nice', 'shiraz', 'day', 'drinking', 'naked', 'pwcwines', 'set', 'white', 'chicken', 'fancy', 'need', 'winning', 'always', 'beauty', 'board', 'bushes', 'competition', 'donjon', 'fell', 'grace', 'meeting', 'moms', 'one', 'pinot', 'porch', 'remember', 'wait', 'wanna', 'wonderwines', 'yall', 'best', 'https', 'januaryjames', 'live', 'make', 'new', 'pretty', 'right', 'way', 'chipotletwins', 'happy', 'name', 'oaked', 'old', 'shit', 'sippchardonnay', 'tasting', 'thanks', 'call', 'called', 'going', 'great', 'people', 'say', 'try', 'want', 'yes', 'brought', 'cant', 'first', 'fourvines', 'lot', 'noir', 'tell', 'today', 'tonight', 'video', 'winewednesday', 'cake', 'check', 'cute', 'even', 'game', 'jason', 'last', 'miss', 'mzchardonnay', 'sauce', 'sean', 'song', 'take', 'tho', 'valley', 'wines', 'around', 'away', 'back', 'bought', 'box', 'classy', 'cream', 'estate', 'fuck', 'gay', 'hey', 'home', 'ive', 'let', 'liked', 'lingerie', 'lmfaoo', 'mom', 'moscato', 'mushroom', 'please', 'rainbow', 'red', 'sauvignon', 'school', 'thank', 'vineyards', 'aint', 'beautiful', 'black', 'blue', 'boys', 'cheers', 'cool', 'dairy', 'food', 'goony', 'green', 'hoes', 'ill', 'ima', 'irishtexan', 'jamesthewineguy', 'lady', 'life', 'lil', 'listen', 'man', 'mantsoepout', 'mind', 'much', 'nah', 'qveenm', 'room', 'sippin', 'sipping', 'smh', 'text', 'thegamebet', 'veraison', 'asking', 'bcwine', 'bit', 'blanc', 'boity', 'buttery', 'chard', 'confessyourunpopularopinion', 'date', 'debortoliwines', 'drank', 'drunk', 'fruit', 'give', 
'house', 'huntervalley', 'keep', 'ladieswhoshoot', 'late', 'lovely', 'month', 'never', 'notes', 'okay', 'paying', 'playing', 'question', 'seriously', 'simple', 'someone', 'started', 'stay', 'thought', 'ultimatebgc', 'vineyard', 'visit', 'walla', 'youre', 'also', 'answer', 'anytime', 'baby', 'bad', 'cause', 'citrus', 'come', 'crisp', 'ctfu', 'cyclone', 'delicious', 'dick', 'didnt', 'doesnt', 'enjoy', 'every', 'friend', 'funny', 'genay', 'glad', 'glasses', 'gonna', 'hes', 'hold', 'http', 'httptc', 'httptcoawdmglpmg', 'join', 'kinda', 'known', 'launches', 'may', 'michael', 'movies', 'next', 'paired', 'perfect', 'pinotnoir', 'point', 'poor', 'put', 'sadlife', 'said', 'salad', 'santa', 'scene', 'shout', 'special', 'stop', 'summer', 'tasha', 'work' )
# Load wordcloud package
library(wordcloud)
# Print the first 10 entries in term_frequency
term_frequency[1:10]
## chardonnay marvin wine gaye just glass
## 824 104 83 76 75 63
## like bottle lol little
## 52 47 43 35
# Create word_freqs
word_freqs <- data.frame(term=names(term_frequency), num=term_frequency)
# Create a wordcloud for the values in word_freqs
wordcloud(word_freqs$term, word_freqs$num, max.words=100, colors="red")
# Create chardonnay_corp
chardonnay_tweets <- rawTweets$Chardonnay
chardonnay_source <- VectorSource(chardonnay_tweets)
chardonnay_corp <- VCorpus(chardonnay_source)
# Add new stop words to clean_corpus()
# Corpus cleaner for the chardonnay tweets: drop punctuation, collapse
# whitespace, remove digits, lower-case, then strip standard English stop
# words plus corpus-specific terms ("amp", "chardonnay", "wine", "glass").
# Implemented as an ordered pipeline of steps folded over the corpus;
# behavior is identical to applying each tm_map call in sequence.
clean_corpus <- function(corpus){
  steps <- list(
    function(x) tm_map(x, removePunctuation),
    function(x) tm_map(x, stripWhitespace),
    function(x) tm_map(x, removeNumbers),
    function(x) tm_map(x, content_transformer(tolower)),
    function(x) tm_map(x, removeWords,
                       c(stopwords("en"), "amp", "chardonnay", "wine", "glass"))
  )
  Reduce(function(acc, step) step(acc), steps, corpus)
}
# Create clean_chardonnay
clean_chardonnay <- clean_corpus(chardonnay_corp)
# Create chardonnay_tdm
chardonnay_tdm <- TermDocumentMatrix(clean_chardonnay)
# Create chardonnay_m
chardonnay_m <- as.matrix(chardonnay_tdm)
# Create chardonnay_words
chardonnay_words <- rowSums(chardonnay_m)
# Copying over the portion of chardonnay_words where there is frequency of 5+
# chardonnay_words <- c( 7, 5, 14, 5, 5, 8, 6, 24, 8, 5, 8, 5, 6, 7, 14, 13, 18, 19, 6, 7, 6, 7, 14, 6, 47, 8, 8, 7, 10, 14, 6, 21, 9, 11, 11, 24, 10, 5, 6, 17, 9, 7, 15, 12, 21, 5, 8, 5, 14, 6, 7, 8, 5, 5, 9, 5, 7, 6, 16, 6, 5, 5, 5, 18, 5, 14, 32, 19, 6, 23, 16, 6, 5, 8, 9, 5, 15, 14, 17, 10, 19, 7, 10, 5, 6, 8, 17, 5, 9, 8, 76, 5, 32, 22, 6, 5, 5, 11, 5, 23, 7, 18, 14, 11, 7, 12, 5, 8, 7, 5, 8, 6, 5, 13, 5, 5, 18, 6, 7, 7, 19, 7, 8, 7, 13, 9, 19, 19, 5, 75, 6, 5, 24, 5, 6, 7, 9, 6, 5, 8, 7, 52, 8, 7, 8, 7, 35, 13, 8, 43, 10, 24, 6, 13, 7, 7, 104, 5, 14, 5, 20, 7, 9, 8, 14, 6, 17, 8, 5, 7, 8, 9, 7, 16, 12, 15, 6, 13, 5, 17, 22, 10, 6, 25, 12, 6, 12, 14, 5, 6, 11, 5, 14, 5, 6, 8, 5, 19, 5, 14, 13, 5, 16, 6, 7, 8, 19, 8, 14, 13, 7, 34, 5, 5, 5, 5, 9, 8, 11, 5, 8, 9, 20, 6, 16, 17, 12, 5, 6, 12, 7, 7, 7, 6, 9, 5, 6, 6, 21, 5, 5, 9, 5, 12, 10, 7, 8, 12, 21, 7, 21, 9, 6, 22, 10, 10, 11, 6, 21, 9, 7, 10, 6, 8, 6, 14, 6, 14, 11, 13, 21, 16, 23, 9, 10, 15, 14, 5, 14, 11, 6 )
# names(chardonnay_words) <- c( 'aint', 'also', 'always', 'answer', 'anytime', 'around', 'asking', 'ass', 'away', 'baby', 'back', 'bad', 'bcwine', 'beautiful', 'beauty', 'best', 'better', 'big', 'bit', 'black', 'blanc', 'blue', 'board', 'boity', 'bottle', 'bought', 'box', 'boys', 'brought', 'bushes', 'buttery', 'cabernet', 'cake', 'call', 'called', 'can', 'cant', 'cause', 'chard', 'charles', 'check', 'cheers', 'chicken', 'chipotletwins', 'chocolate', 'citrus', 'classy', 'come', 'competition', 'confessyourunpopularopinion', 'cool', 'cream', 'crisp', 'ctfu', 'cute', 'cyclone', 'dairy', 'date', 'day', 'debortoliwines', 'delicious', 'dick', 'didnt', 'dinner', 'doesnt', 'donjon', 'dont', 'double', 'drank', 'drink', 'drinking', 'drunk', 'enjoy', 'estate', 'even', 'every', 'fancy', 'fell', 'fine', 'first', 'fists', 'food', 'fourvines', 'friend', 'fruit', 'fuck', 'full', 'funny', 'game', 'gay', 'gaye', 'genay', 'get', 'girl', 'give', 'glad', 'glasses', 'going', 'gonna', 'good', 'goony', 'got', 'grace', 'great', 'green', 'happy', 'hes', 'hey', 'hoes', 'hold', 'home', 'house', 'http', 'https', 'httptc', 'httptcoawdmglpmg', 'httptcodudylkw', 'huntervalley', 'ill', 'ima', 'inspired', 'irishtexan', 'ive', 'jamesthewineguy', 'januaryjames', 'jason', 'jinkx', 'jinkxmonsoon', 'join', 'just', 'keep', 'kinda', 'know', 'known', 'ladieswhoshoot', 'lady', 'last', 'late', 'launches', 'let', 'life', 'like', 'liked', 'lil', 'lingerie', 'listen', 'little', 'live', 'lmfaoo', 'lol', 'lot', 'love', 'lovely', 'make', 'man', 'mantsoepout', 'marvin', 'may', 'meeting', 'michael', 'milkshake', 'mind', 'miss', 'mom', 'moms', 'month', 'mood', 'moscato', 'movies', 'much', 'mushroom', 'mzchardonnay', 'nah', 'naked', 'name', 'need', 'never', 'new', 'next', 'nice', 'night', 'noir', 'notes', 'now', 'oaked', 'okay', 'old', 'one', 'paired', 'paying', 'people', 'perfect', 'pinot', 'pinotnoir', 'playing', 'please', 'point', 'polite', 'poor', 'porch', 'pretty', 'put', 'pwcwines', 'question', 'qveenm', 
'rainbow', 'really', 'red', 'remember', 'right', 'room', 'rose', 'sadlife', 'said', 'salad', 'santa', 'sauce', 'sauvignon', 'say', 'scene', 'school', 'sean', 'see', 'seriously', 'set', 'shiraz', 'shit', 'shout', 'simple', 'sippchardonnay', 'sippin', 'sipping', 'smh', 'someone', 'song', 'special', 'started', 'stay', 'still', 'stop', 'summer', 'take', 'tasha', 'tasting', 'tell', 'text', 'thank', 'thanks', 'thats', 'thegamebet', 'think', 'tho', 'thought', 'time', 'today', 'tonight', 'try', 'ultimatebgc', 'unoaked', 'valley', 'veraison', 'video', 'vineyard', 'vineyards', 'visit', 'wait', 'walla', 'wanna', 'want', 'way', 'well', 'white', 'will', 'wines', 'winewednesday', 'winning', 'wonderwines', 'work', 'yall', 'yes', 'youre' )
# Sort the chardonnay_words in descending order
chardonnay_words <- sort(chardonnay_words, decreasing=TRUE)
# Print the 6 most frequent chardonnay terms
sort(chardonnay_words, decreasing=TRUE)[1:6]
## marvin gaye just like bottle lol
## 104 76 75 52 47 43
# Create chardonnay_freqs
chardonnay_freqs <- data.frame(term=names(chardonnay_words), num=chardonnay_words)
# Create a wordcloud for the values in chardonnay_freqs
wordcloud(chardonnay_freqs$term, chardonnay_freqs$num, max.words=50, colors="red")
# Print the list of colors
colors()
## [1] "white" "aliceblue" "antiquewhite"
## [4] "antiquewhite1" "antiquewhite2" "antiquewhite3"
## [7] "antiquewhite4" "aquamarine" "aquamarine1"
## [10] "aquamarine2" "aquamarine3" "aquamarine4"
## [13] "azure" "azure1" "azure2"
## [16] "azure3" "azure4" "beige"
## [19] "bisque" "bisque1" "bisque2"
## [22] "bisque3" "bisque4" "black"
## [25] "blanchedalmond" "blue" "blue1"
## [28] "blue2" "blue3" "blue4"
## [31] "blueviolet" "brown" "brown1"
## [34] "brown2" "brown3" "brown4"
## [37] "burlywood" "burlywood1" "burlywood2"
## [40] "burlywood3" "burlywood4" "cadetblue"
## [43] "cadetblue1" "cadetblue2" "cadetblue3"
## [46] "cadetblue4" "chartreuse" "chartreuse1"
## [49] "chartreuse2" "chartreuse3" "chartreuse4"
## [52] "chocolate" "chocolate1" "chocolate2"
## [55] "chocolate3" "chocolate4" "coral"
## [58] "coral1" "coral2" "coral3"
## [61] "coral4" "cornflowerblue" "cornsilk"
## [64] "cornsilk1" "cornsilk2" "cornsilk3"
## [67] "cornsilk4" "cyan" "cyan1"
## [70] "cyan2" "cyan3" "cyan4"
## [73] "darkblue" "darkcyan" "darkgoldenrod"
## [76] "darkgoldenrod1" "darkgoldenrod2" "darkgoldenrod3"
## [79] "darkgoldenrod4" "darkgray" "darkgreen"
## [82] "darkgrey" "darkkhaki" "darkmagenta"
## [85] "darkolivegreen" "darkolivegreen1" "darkolivegreen2"
## [88] "darkolivegreen3" "darkolivegreen4" "darkorange"
## [91] "darkorange1" "darkorange2" "darkorange3"
## [94] "darkorange4" "darkorchid" "darkorchid1"
## [97] "darkorchid2" "darkorchid3" "darkorchid4"
## [100] "darkred" "darksalmon" "darkseagreen"
## [103] "darkseagreen1" "darkseagreen2" "darkseagreen3"
## [106] "darkseagreen4" "darkslateblue" "darkslategray"
## [109] "darkslategray1" "darkslategray2" "darkslategray3"
## [112] "darkslategray4" "darkslategrey" "darkturquoise"
## [115] "darkviolet" "deeppink" "deeppink1"
## [118] "deeppink2" "deeppink3" "deeppink4"
## [121] "deepskyblue" "deepskyblue1" "deepskyblue2"
## [124] "deepskyblue3" "deepskyblue4" "dimgray"
## [127] "dimgrey" "dodgerblue" "dodgerblue1"
## [130] "dodgerblue2" "dodgerblue3" "dodgerblue4"
## [133] "firebrick" "firebrick1" "firebrick2"
## [136] "firebrick3" "firebrick4" "floralwhite"
## [139] "forestgreen" "gainsboro" "ghostwhite"
## [142] "gold" "gold1" "gold2"
## [145] "gold3" "gold4" "goldenrod"
## [148] "goldenrod1" "goldenrod2" "goldenrod3"
## [151] "goldenrod4" "gray" "gray0"
## [154] "gray1" "gray2" "gray3"
## [157] "gray4" "gray5" "gray6"
## [160] "gray7" "gray8" "gray9"
## [163] "gray10" "gray11" "gray12"
## [166] "gray13" "gray14" "gray15"
## [169] "gray16" "gray17" "gray18"
## [172] "gray19" "gray20" "gray21"
## [175] "gray22" "gray23" "gray24"
## [178] "gray25" "gray26" "gray27"
## [181] "gray28" "gray29" "gray30"
## [184] "gray31" "gray32" "gray33"
## [187] "gray34" "gray35" "gray36"
## [190] "gray37" "gray38" "gray39"
## [193] "gray40" "gray41" "gray42"
## [196] "gray43" "gray44" "gray45"
## [199] "gray46" "gray47" "gray48"
## [202] "gray49" "gray50" "gray51"
## [205] "gray52" "gray53" "gray54"
## [208] "gray55" "gray56" "gray57"
## [211] "gray58" "gray59" "gray60"
## [214] "gray61" "gray62" "gray63"
## [217] "gray64" "gray65" "gray66"
## [220] "gray67" "gray68" "gray69"
## [223] "gray70" "gray71" "gray72"
## [226] "gray73" "gray74" "gray75"
## [229] "gray76" "gray77" "gray78"
## [232] "gray79" "gray80" "gray81"
## [235] "gray82" "gray83" "gray84"
## [238] "gray85" "gray86" "gray87"
## [241] "gray88" "gray89" "gray90"
## [244] "gray91" "gray92" "gray93"
## [247] "gray94" "gray95" "gray96"
## [250] "gray97" "gray98" "gray99"
## [253] "gray100" "green" "green1"
## [256] "green2" "green3" "green4"
## [259] "greenyellow" "grey" "grey0"
## [262] "grey1" "grey2" "grey3"
## [265] "grey4" "grey5" "grey6"
## [268] "grey7" "grey8" "grey9"
## [271] "grey10" "grey11" "grey12"
## [274] "grey13" "grey14" "grey15"
## [277] "grey16" "grey17" "grey18"
## [280] "grey19" "grey20" "grey21"
## [283] "grey22" "grey23" "grey24"
## [286] "grey25" "grey26" "grey27"
## [289] "grey28" "grey29" "grey30"
## [292] "grey31" "grey32" "grey33"
## [295] "grey34" "grey35" "grey36"
## [298] "grey37" "grey38" "grey39"
## [301] "grey40" "grey41" "grey42"
## [304] "grey43" "grey44" "grey45"
## [307] "grey46" "grey47" "grey48"
## [310] "grey49" "grey50" "grey51"
## [313] "grey52" "grey53" "grey54"
## [316] "grey55" "grey56" "grey57"
## [319] "grey58" "grey59" "grey60"
## [322] "grey61" "grey62" "grey63"
## [325] "grey64" "grey65" "grey66"
## [328] "grey67" "grey68" "grey69"
## [331] "grey70" "grey71" "grey72"
## [334] "grey73" "grey74" "grey75"
## [337] "grey76" "grey77" "grey78"
## [340] "grey79" "grey80" "grey81"
## [343] "grey82" "grey83" "grey84"
## [346] "grey85" "grey86" "grey87"
## [349] "grey88" "grey89" "grey90"
## [352] "grey91" "grey92" "grey93"
## [355] "grey94" "grey95" "grey96"
## [358] "grey97" "grey98" "grey99"
## [361] "grey100" "honeydew" "honeydew1"
## [364] "honeydew2" "honeydew3" "honeydew4"
## [367] "hotpink" "hotpink1" "hotpink2"
## [370] "hotpink3" "hotpink4" "indianred"
## [373] "indianred1" "indianred2" "indianred3"
## [376] "indianred4" "ivory" "ivory1"
## [379] "ivory2" "ivory3" "ivory4"
## [382] "khaki" "khaki1" "khaki2"
## [385] "khaki3" "khaki4" "lavender"
## [388] "lavenderblush" "lavenderblush1" "lavenderblush2"
## [391] "lavenderblush3" "lavenderblush4" "lawngreen"
## [394] "lemonchiffon" "lemonchiffon1" "lemonchiffon2"
## [397] "lemonchiffon3" "lemonchiffon4" "lightblue"
## [400] "lightblue1" "lightblue2" "lightblue3"
## [403] "lightblue4" "lightcoral" "lightcyan"
## [406] "lightcyan1" "lightcyan2" "lightcyan3"
## [409] "lightcyan4" "lightgoldenrod" "lightgoldenrod1"
## [412] "lightgoldenrod2" "lightgoldenrod3" "lightgoldenrod4"
## [415] "lightgoldenrodyellow" "lightgray" "lightgreen"
## [418] "lightgrey" "lightpink" "lightpink1"
## [421] "lightpink2" "lightpink3" "lightpink4"
## [424] "lightsalmon" "lightsalmon1" "lightsalmon2"
## [427] "lightsalmon3" "lightsalmon4" "lightseagreen"
## [430] "lightskyblue" "lightskyblue1" "lightskyblue2"
## [433] "lightskyblue3" "lightskyblue4" "lightslateblue"
## [436] "lightslategray" "lightslategrey" "lightsteelblue"
## [439] "lightsteelblue1" "lightsteelblue2" "lightsteelblue3"
## [442] "lightsteelblue4" "lightyellow" "lightyellow1"
## [445] "lightyellow2" "lightyellow3" "lightyellow4"
## [448] "limegreen" "linen" "magenta"
## [451] "magenta1" "magenta2" "magenta3"
## [454] "magenta4" "maroon" "maroon1"
## [457] "maroon2" "maroon3" "maroon4"
## [460] "mediumaquamarine" "mediumblue" "mediumorchid"
## [463] "mediumorchid1" "mediumorchid2" "mediumorchid3"
## [466] "mediumorchid4" "mediumpurple" "mediumpurple1"
## [469] "mediumpurple2" "mediumpurple3" "mediumpurple4"
## [472] "mediumseagreen" "mediumslateblue" "mediumspringgreen"
## [475] "mediumturquoise" "mediumvioletred" "midnightblue"
## [478] "mintcream" "mistyrose" "mistyrose1"
## [481] "mistyrose2" "mistyrose3" "mistyrose4"
## [484] "moccasin" "navajowhite" "navajowhite1"
## [487] "navajowhite2" "navajowhite3" "navajowhite4"
## [490] "navy" "navyblue" "oldlace"
## [493] "olivedrab" "olivedrab1" "olivedrab2"
## [496] "olivedrab3" "olivedrab4" "orange"
## [499] "orange1" "orange2" "orange3"
## [502] "orange4" "orangered" "orangered1"
## [505] "orangered2" "orangered3" "orangered4"
## [508] "orchid" "orchid1" "orchid2"
## [511] "orchid3" "orchid4" "palegoldenrod"
## [514] "palegreen" "palegreen1" "palegreen2"
## [517] "palegreen3" "palegreen4" "paleturquoise"
## [520] "paleturquoise1" "paleturquoise2" "paleturquoise3"
## [523] "paleturquoise4" "palevioletred" "palevioletred1"
## [526] "palevioletred2" "palevioletred3" "palevioletred4"
## [529] "papayawhip" "peachpuff" "peachpuff1"
## [532] "peachpuff2" "peachpuff3" "peachpuff4"
## [535] "peru" "pink" "pink1"
## [538] "pink2" "pink3" "pink4"
## [541] "plum" "plum1" "plum2"
## [544] "plum3" "plum4" "powderblue"
## [547] "purple" "purple1" "purple2"
## [550] "purple3" "purple4" "red"
## [553] "red1" "red2" "red3"
## [556] "red4" "rosybrown" "rosybrown1"
## [559] "rosybrown2" "rosybrown3" "rosybrown4"
## [562] "royalblue" "royalblue1" "royalblue2"
## [565] "royalblue3" "royalblue4" "saddlebrown"
## [568] "salmon" "salmon1" "salmon2"
## [571] "salmon3" "salmon4" "sandybrown"
## [574] "seagreen" "seagreen1" "seagreen2"
## [577] "seagreen3" "seagreen4" "seashell"
## [580] "seashell1" "seashell2" "seashell3"
## [583] "seashell4" "sienna" "sienna1"
## [586] "sienna2" "sienna3" "sienna4"
## [589] "skyblue" "skyblue1" "skyblue2"
## [592] "skyblue3" "skyblue4" "slateblue"
## [595] "slateblue1" "slateblue2" "slateblue3"
## [598] "slateblue4" "slategray" "slategray1"
## [601] "slategray2" "slategray3" "slategray4"
## [604] "slategrey" "snow" "snow1"
## [607] "snow2" "snow3" "snow4"
## [610] "springgreen" "springgreen1" "springgreen2"
## [613] "springgreen3" "springgreen4" "steelblue"
## [616] "steelblue1" "steelblue2" "steelblue3"
## [619] "steelblue4" "tan" "tan1"
## [622] "tan2" "tan3" "tan4"
## [625] "thistle" "thistle1" "thistle2"
## [628] "thistle3" "thistle4" "tomato"
## [631] "tomato1" "tomato2" "tomato3"
## [634] "tomato4" "turquoise" "turquoise1"
## [637] "turquoise2" "turquoise3" "turquoise4"
## [640] "violet" "violetred" "violetred1"
## [643] "violetred2" "violetred3" "violetred4"
## [646] "wheat" "wheat1" "wheat2"
## [649] "wheat3" "wheat4" "whitesmoke"
## [652] "yellow" "yellow1" "yellow2"
## [655] "yellow3" "yellow4" "yellowgreen"
# Print the wordcloud with the specified colors
wordcloud(chardonnay_freqs$term,
chardonnay_freqs$num,
max.words = 100,
colors = c("grey80", "darkgoldenrod1", "tomato")
)
# List the available colors
display.brewer.all()
# Create purple_orange
purple_orange <- brewer.pal(10, "PuOr")
# Drop 2 faintest colors
purple_orange <- purple_orange[-(1:2)]
# Create a wordcloud with purple_orange palette
wordcloud(chardonnay_freqs$term, chardonnay_freqs$num, max.words = 100, colors = purple_orange)
# Create all_coffee
all_coffee <- paste(coffee_tweets, collapse=" ")
# Create all_chardonnay
all_chardonnay <- paste(chardonnay_tweets, collapse=" ")
# Create all_tweets
all_tweets <- c(all_coffee, all_chardonnay)
# Convert to a vector source
all_tweets <- VectorSource(all_tweets)
# Create all_corpus
all_corpus <- VCorpus(all_tweets)
# Shared cleaner for the combined coffee/chardonnay corpus: punctuation,
# whitespace, numbers, case-folding, then stop-word removal (standard
# English plus "amp", "glass", "chardonnay", "coffee" so neither topic
# word dominates the comparison plots).
clean_corpus <- function(corpus){
  out <- tm_map(corpus, removePunctuation)
  out <- tm_map(out, stripWhitespace)
  out <- tm_map(out, removeNumbers)
  out <- tm_map(out, content_transformer(tolower))
  drop_words <- c(stopwords("en"), "amp", "glass", "chardonnay", "coffee")
  tm_map(out, removeWords, drop_words)
}
# Clean the corpus
all_clean <- clean_corpus(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_clean)
# Create all_m
all_tdm_m <- as.matrix(all_tdm)
# Print a commonality cloud
commonality.cloud(all_tdm_m, colors="steelblue1", max.words=100)
# Clean the corpus
all_clean <- clean_corpus(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_clean)
# Give the columns distinct names
colnames(all_tdm) <- c("coffee", "chardonnay")
# Create all_m
all_tdm_m <- as.matrix(all_tdm)
# Create comparison cloud
comparison.cloud(all_tdm_m, colors=c("orange", "blue"), max.words=50)
# Create common_words
common_words <- subset(all_tdm_m, all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
# Combine common_words and difference
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]
# Create top25_df
top25_df <- data.frame(x = common_words[1:25, 1],
y = common_words[1:25, 2],
labels = rownames(common_words[1:25, ]))
# Create the pyramid plot
plotrix::pyramid.plot(top25_df$x, top25_df$y, labels=top25_df$labels,
gap=8, top.labels = c("Coffee", "Words", "Chardonnay"),
main = "Words in Common", laxlab = NULL,
raxlab = NULL, unit = NULL
)
## [1] 5.1 4.1 4.1 2.1
# Word association
word_associate(coffee_tweets, match.string = c("barista"),
stopwords = c(Top200Words, "coffee", "amp"),
network.plot = TRUE, cloud.colors = c("gray85", "darkred"))
## Warning in text2color(words = V(g)$label, recode.words = target.words,
## colors = label.colors): length of colors should be 1 more than length of
## recode.words
## row group unit text
## 1 544 all 544 RT @Barista_kyo: #coffee #latte #soylatte #thinkcoffee # # # # @ think coffee http://t.co/Hmy9RPRWTZ
## 2 569 all 569 RT @ReversoSmith: What a beautiful mess! #portafilter #coffee #espresso #coffeemachine #barista #baristalife? http://t.co/ZODcTfP22Z
## 3 658 all 658 The moment you realize your Starbucks barista gave you a regular iced Coffee when u asked 4 decaf. Shitty. Late night not planned.
## 4 931 all 931 Barista made my coffee wrong and still gave me both anyway #Starbucks #coffee #caffeine #upallnight http://t.co/iKCNwO8F6t
## 5 951 all 951 RT @FrankIero: hahaha @jamiasan :*gives Barista our Starbucks order* Barista: coffee? @jamiasan : yes, isn't this is a coffee store?
##
## Match Terms
## ===========
##
## List 1:
## baristakyo, barista, baristalife
##
# Add title
title(main = "Barista Coffee Tweet Associations")
Chapter 3 - Additional text mining (library tm) skills
Simple word clustering - hierarchical clustering and dendrograms (trees):
Getting past single words - considering “not” followed by “good” to have a very specific meaning, rather than just being a sentence containing “not” and “good”:
Different frequency criteria - frequent words can mask insights:
Example code includes:
rain <- data.frame(city=c( 'Cleveland', 'Portland', 'Boston', 'New Orleans' ),
rainfall=c( 39.14, 39.14, 43.77, 62.45 ),
stringsAsFactors=FALSE
)
str(rain)
## 'data.frame': 4 obs. of 2 variables:
## $ city : chr "Cleveland" "Portland" "Boston" "New Orleans"
## $ rainfall: num 39.1 39.1 43.8 62.5
# Create dist_rain
dist_rain <- dist(rain$rainfall)
# View the distance matrix
dist_rain
## 1 2 3
## 2 0.00
## 3 4.63 4.63
## 4 23.31 23.31 18.68
# Create hc
hc <- hclust(dist_rain)
# Plot hc
plot(hc, labels=rain$city)
# NEED TO DOUBLE CHECK EXISTENCE OF tweets_tdm
# Print the dimensions of tweets_tdm
tweets_tdm <- coffee_tdm
dim(tweets_tdm)
## [1] 3075 1000
# Create tdm1
tdm1 <- removeSparseTerms(tweets_tdm, sparse=0.95)
# Create tdm2
tdm2 <- removeSparseTerms(tweets_tdm, sparse=0.975)
# Print tdm1
tdm1
## <<TermDocumentMatrix (terms: 6, documents: 1000)>>
## Non-/sparse entries: 418/5582
## Sparsity : 93%
## Maximal term length: 7
## Weighting : term frequency (tf)
# Print tdm2
tdm2
## <<TermDocumentMatrix (terms: 40, documents: 1000)>>
## Non-/sparse entries: 1646/38354
## Sparsity : 96%
## Maximal term length: 13
## Weighting : term frequency (tf)
# Create tweets_tdm2
tweets_tdm2 <- removeSparseTerms(tweets_tdm, sparse=0.975)
# Create tdm_m
tdm_m <- as.matrix(tweets_tdm2)
# Create tdm_df
tdm_df <- as.data.frame(tdm_m)
# Create tweets_dist
tweets_dist <- dist(tdm_df)
# Create hc
hc <- hclust(tweets_dist)
# Plot the dendrogram
plot(hc)
# Load dendextend
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.5.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:qdap':
##
## %>%
## The following object is masked from 'package:stats':
##
## cutree
# Create hc
hc <- hclust(tweets_dist)
# Create hcd
hcd <- as.dendrogram(hc)
# Print the labels in hcd
labels(hcd)
## [1] "cup" "like" "shop" "looks"
## [5] "show" "hgtv" "renovation" "charlie"
## [9] "hosting" "working" "portland" "movethesticks"
## [13] "whitehurst" "just" "get" "good"
## [17] "morning" "want" "tea" "drinking"
## [21] "can" "starbucks" "think" "iced"
## [25] "half" "chemicals" "cancer" "tested"
## [29] "1000" "single" "need" "ice"
## [33] "much" "amp" "now" "right"
## [37] "love" "make" "dont" "drink"
# Change the branch color to red for "starbucks" and "cup"
hcd <- branches_attr_by_labels(hcd, c("starbucks", "cup"), color="red")
# Plot hcd
plot(hcd)
# Add cluster rectangles
rect.dendrogram(hcd, k=2, border="grey50")
# Create hc
hc <- hclust(tweets_dist)
# Create hcd
hcd <- as.dendrogram(hc)
# Print the labels in hcd
labels(hcd)
## [1] "cup" "like" "shop" "looks"
## [5] "show" "hgtv" "renovation" "charlie"
## [9] "hosting" "working" "portland" "movethesticks"
## [13] "whitehurst" "just" "get" "good"
## [17] "morning" "want" "tea" "drinking"
## [21] "can" "starbucks" "think" "iced"
## [25] "half" "chemicals" "cancer" "tested"
## [29] "1000" "single" "need" "ice"
## [33] "much" "amp" "now" "right"
## [37] "love" "make" "dont" "drink"
# Change the branch color to red for "cup" and "just"
hcd <- branches_attr_by_labels(hcd, c("cup", "just"), color="red")
# Plot hcd
plot(hcd)
# Add cluster rectangles
rect.dendrogram(hcd, k=2, border="grey50")
# Create associations
associations <- findAssocs(tweets_tdm, "venti", 0.2)
# View the venti associations
associations
## $venti
## breve drizzle entire pumps extra cuz forget
## 0.58 0.58 0.58 0.58 0.47 0.41 0.41
## okay hyper mocha vanilla wtf always asleep
## 0.41 0.33 0.33 0.33 0.29 0.26 0.26
## get starbucks white
## 0.25 0.25 0.23
# Create associations_df
associations_df <- list_vect2df(associations)[, 2:3]
# Plot the associations_df values (don't change this)
ggplot(associations_df, aes(y = associations_df[, 1])) +
geom_point(aes(x = associations_df[, 2]),
data = associations_df, size = 3) +
theme_gdocs()
# DOES NOT WORK ON MY MACHINE
# Make tokenizer function
# Bigram tokenizer: passed as the `tokenize` control option to
# DocumentTermMatrix/TermDocumentMatrix so terms are 2-word phrases.
tokenizer <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))
}
text_corp <- clean_chardonnay
# Create unigram_dtm
unigram_dtm <- DocumentTermMatrix(text_corp)
# Create bigram_dtm
bigram_dtm <- DocumentTermMatrix(text_corp, control=list(tokenize=tokenizer))
# Examine unigram_dtm
unigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 2979)>>
## Non-/sparse entries: 6986/2972014
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Examine bigram_dtm
bigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 4812)>>
## Non-/sparse entries: 6680/4805320
## Sparsity : 100%
## Maximal term length: 41
## Weighting : term frequency (tf)
# Create bigram_dtm_m
bigram_dtm_m <- as.matrix(bigram_dtm)
# Create freq
freq <- colSums(bigram_dtm_m)
# Create bi_words
bi_words <- names(freq)
# Examine part of bi_words
bi_words[2577:2587]
## [1] "mean liyah" "meaningless round"
## [3] "means bottles" "measure hamilton"
## [5] "meat piss" "meditation httptcoyjsysbuby"
## [7] "medium finish" "meds rare"
## [9] "meet anybody" "meet lot"
## [11] "meet three"
# Plot a wordcloud
wordcloud(bi_words, freq, max.words=15)
# Create tf_tdm
tf_tdm <- TermDocumentMatrix(text_corp)
# Create tfidf_tdm
tfidf_tdm <- TermDocumentMatrix(text_corp, control=list(weighting = weightTfIdf))
## Warning in weighting(x): empty document(s): 303 480 743
# Create tf_tdm_m
tf_tdm_m <- as.matrix(tf_tdm)
# Create tfidf_tdm_m
tfidf_tdm_m <- as.matrix(tfidf_tdm)
# Examine part of tf_tdm_m
tf_tdm_m[508:509, 5:10]
## Docs
## Terms 5 6 7 8 9 10
## corner 0 0 0 0 0 0
## corriander 0 0 0 0 0 0
# Examine part of tfidf_tdm_m
# (The original re-printed tf_tdm_m here by mistake; the displayed output is
# unchanged because both rows are all zeros, and the tf-idf weight of a zero
# count is still zero.)
tfidf_tdm_m[508:509, 5:10]
## Docs
## Terms 5 6 7 8 9 10
## corner 0 0 0 0 0 0
## corriander 0 0 0 0 0 0
# DO NOT HAVE dataframe tweets
# Add author to custom reading list
custom_reader <- readTabular(mapping = list(content = "text",
id = "num",
author = "screenName",
date = "created"
))
# Make corpus with custom reading
# text_corpus <- VCorpus(DataframeSource(tweets), readerControl = list(reader = custom_reader))
# Clean corpus
# text_corpus <- clean_corpus(text_corpus)
# Print data
# text_corpus[[1]][1]
# Print metadata
# text_corpus[[1]][2]
Chapter 4 - Case study
Amazon vs Google case study - following the six key steps on an HR analytics project:
Step 3: Text Organization - creating an integrated qdapClean function:
Steps 4&5: Feature Extraction and Analysis - for example, sentiment scoring or bi-gram TDM:
Step 6: Reach a conclusion - end of the work flow:
Example code includes:
# Re-creating the data sets available in the case study
test <- read.csv("AmazonGoogleHRData_v001.csv",
stringsAsFactors=FALSE,
na.strings=c("NA", "NA ")
)
amzn <- subset(test, src=="amzn")
goog <- subset(test, src=="goog")
amzn$src <- NULL
goog$src <- NULL
# Print the structure of amzn
str(amzn)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : chr "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P50.htm " ...
## $ pros : chr "You're surrounded by smart people and the projects are interesting, if a little daunting. " "Brand name is great. Have yet to meet somebody who is unfamiliar with Amazon. Hours weren't as bad as I had previously heard. B"| __truncated__ "Good money.Interaction with some great minds in the world during internal conferences and sessions.Of course the pride of being"| __truncated__ "nice pay and overtime and different shifts " ...
## $ cons : chr "Internal tools proliferation has created a mess for trying to get to basic information. Most people are required to learn/under"| __truncated__ "not the most stimulating work. Good brand name to work for but the work itself is mundane as it can get. As a financial analyst"| __truncated__ "No proper growth plan for employees.Difficult promotion process requiring a lot more documentation than your actual deliverable"| __truncated__ "didn't last quite long enough " ...
# Create amzn_pros
amzn_pros <- amzn$pros
# Create amzn_cons
amzn_cons <- amzn$cons
# Print the structure of goog
str(goog)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : chr "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm " ...
## $ pros : chr "* If you're a software engineer, you're among the kings of the hill at Google. It's an engineer-driven company without a doubt "| __truncated__ "1) Food, food, food. 15+ cafes on main campus (MTV) alone. Mini-kitchens, snacks, drinks, free breakfast/lunch/dinner, all day,"| __truncated__ "You can't find a more well-regarded company that actually deserves the hype it gets. " "- you drive yourself here. If you want to grow, you have to seek out opportunities and prove that your worth. This keeps you mo"| __truncated__ ...
## $ cons : chr "* It *is* becoming larger, and with it comes growing pains: bureaucracy, slow to respond to market threats, bloated teams, cros"| __truncated__ "1) Work/life balance. What balance? All those perks and benefits are an illusion. They keep you at work and they help you to be"| __truncated__ "I live in SF so the commute can take between 1.5 hours to 1.75 hours each way on the shuttle - sometimes 2 hours each way on a "| __truncated__ "- Google is a big company. So there are going to be winners and losers when it comes to career growth. Due to the high hiring b"| __truncated__ ...
# Create goog_pros
goog_pros <- goog$pros
# Create goog_cons
goog_cons <- goog$cons
qdap_clean <- function(x) {
  # qdap-based text normalization applied to the raw review text before
  # corpus construction: expand abbreviations and contractions, spell out
  # numbers, ordinals and symbols, then lower-case everything.
  #
  # @param x Character vector of raw text.
  # @return Character vector of the same length, normalized and lower-cased.
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_number(x)
  # The original called replace_ordinal() twice in a row; the second call
  # was a redundant duplicate (a second pass finds nothing left to replace).
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
tm_clean <- function(corpus) {
  # tm cleaning pipeline for the Amazon-vs-Google corpora: strip
  # punctuation, collapse whitespace, then drop English stopwords plus the
  # company names themselves (which would otherwise dominate every
  # frequency ranking).
  #
  # @param corpus A tm VCorpus whose text has already been run through
  #   qdap_clean() (and is therefore lower-cased).
  # @return The cleaned VCorpus.
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  # BUG FIX: removeWords() is case-sensitive and the corpus is already
  # lower-cased, so the original capitalized terms "Google"/"Amazon" could
  # never match. Use lower-case terms so the company names are removed.
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "google", "amazon", "company"))
  return(corpus)
}
# Alter amzn_pros
amzn_pros <- qdap_clean(amzn_pros)
# Alter amzn_cons
amzn_cons <- qdap_clean(amzn_cons)
# Create az_p_corp
az_p_corp <- VCorpus(VectorSource(amzn_pros[complete.cases(amzn_pros)]))
# Create az_c_corp
az_c_corp <- VCorpus(VectorSource(amzn_cons[complete.cases(amzn_cons)]))
# Create amzn_pros_corp
amzn_pros_corp <- tm_clean(az_p_corp)
# Create amzn_cons_corp
amzn_cons_corp <- tm_clean(az_c_corp)
# Apply qdap_clean to goog_pros
goog_pros <- qdap_clean(goog_pros)
# Apply qdap_clean to goog_cons
goog_cons <- qdap_clean(goog_cons)
# Create goog_p_corp
# complete.cases() to avoid the NA problem in RWeka::NGramTokenizer
goog_p_corp <- VCorpus(VectorSource(goog_pros[complete.cases(goog_pros)]))
# Create goog_c_corp
# complete.cases() to avoid the NA problem in RWeka::NGramTokenizer
goog_c_corp <- VCorpus(VectorSource(goog_cons[complete.cases(goog_cons)]))
# Create goog_pros_corp
goog_pros_corp <- tm_clean(goog_p_corp)
# Create goog_cons_corp
goog_cons_corp <- tm_clean(goog_c_corp)
# DOES NOT WORK ON MY MACHINE (needed the complete.cases() fix above - seems to struggle with NA data)
# Bigram tokenizer supplied to TermDocumentMatrix via control=list(tokenize=...)
# so that terms are two-word phrases rather than single words.
tokenizer <- function(doc) {
  RWeka::NGramTokenizer(doc, RWeka::Weka_control(min = 2, max = 2))
}
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(amzn_pros_corp, control=list(tokenize=tokenizer))
# Create amzn_p_tdm_m
amzn_p_tdm_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_tdm_m)
# Plot a wordcloud using amzn_p_freq values
wordcloud(names(amzn_p_freq), amzn_p_freq, max.words=25, color="blue")
## Warning in wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, color
## = "blue"): good benefits could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, color
## = "blue"): fast paced could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(amzn_p_freq), amzn_p_freq, max.words = 25, color
## = "blue"): smart people could not be fit on page. It will not be plotted.
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(amzn_cons_corp, control=list(tokenize=tokenizer))
# Create amzn_c_tdm_m
amzn_c_tdm_m <- as.matrix(amzn_c_tdm)
# Create amzn_c_freq
amzn_c_freq <- rowSums(amzn_c_tdm_m)
# Plot a wordcloud of negative Amazon bigrams
wordcloud(names(amzn_c_freq), amzn_c_freq, max.words=25, color="red")
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(amzn_cons_corp, control=list(tokenize=tokenizer))
# Print amzn_c_tdm to the console
amzn_c_tdm
## <<TermDocumentMatrix (terms: 4777, documents: 494)>>
## Non-/sparse entries: 5217/2354621
## Sparsity : 100%
## Maximal term length: 31
## Weighting : term frequency (tf)
# Create amzn_c_tdm2 by removing sparse terms
amzn_c_tdm2 <- removeSparseTerms(amzn_c_tdm, sparse=0.993)
# Create hc as a cluster of distance values
hc <- hclust(dist(amzn_c_tdm2, method="euclidean"), method="complete")
# Produce a plot of hc
plot(hc)
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(amzn_pros_corp, control=list(tokenize=tokenizer))
# Create amzn_p_m
amzn_p_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_m)
# Create term_frequency
term_frequency <- sort(amzn_p_freq, decreasing=TRUE)
# Print the 5 most common terms
term_frequency[1:5]
## good pay great benefits smart people place work fast paced
## 25 24 20 17 16
# Find associations with fast paced
findAssocs(amzn_p_tdm, "fast paced", 0.2)
## $`fast paced`
## paced environment environments ever learn fast
## 0.49 0.35 0.35
## paced friendly paced work able excel
## 0.35 0.35 0.25
## activity ample advance one also well
## 0.25 0.25 0.25
## amazon fast amazon noting amazon one
## 0.25 0.25 0.25
## amount time ample opportunity assistance ninety
## 0.25 0.25 0.25
## benefits including break computer call activity
## 0.25 0.25 0.25
## can choose catchy cheers center things
## 0.25 0.25 0.25
## challenging expect cheers opportunity choose success
## 0.25 0.25 0.25
## combined encouragement competitive environments computer room
## 0.25 0.25 0.25
## cool things deliver results dock makes
## 0.25 0.25 0.25
## driven deliver easy learn emphasis shipping
## 0.25 0.25 0.25
## encouragement innovation environment benefits environment catchy
## 0.25 0.25 0.25
## environment center environment fast environment help
## 0.25 0.25 0.25
## environment smart ever known ever witnessed
## 0.25 0.25 0.25
## everchanging fast everyones preferences excel advance
## 0.25 0.25 0.25
## excel everchanging exciting environment expect learn
## 0.25 0.25 0.25
## extremely fast facility top fail successful
## 0.25 0.25 0.25
## fantastic able fired part five percent
## 0.25 0.25 0.25
## freindly place friendly atmosphere friendly management
## 0.25 0.25 0.25
## full medical get fired go extremely
## 0.25 0.25 0.25
## great plenty great teamwork happening technology
## 0.25 0.25 0.25
## hassle benefits help get help workers
## 0.25 0.25 0.25
## high quality high volume including full
## 0.25 0.25 0.25
## innovation owning job requirements leader can
## 0.25 0.25 0.25
## line break lot responsibility maintaining high
## 0.25 0.25 0.25
## makes time management nice nice facility
## 0.25 0.25 0.25
## ninety five noting short offers opportunity
## 0.25 0.25 0.25
## one competitive one fast opportunity overtime
## 0.25 0.25 0.25
## opportunity yell ownership fast owning work
## 0.25 0.25 0.25
## paced emphasis paced exciting paced high
## 0.25 0.25 0.25
## paced never paced rewarding paced ship
## 0.25 0.25 0.25
## paced software paid upfront people focused
## 0.25 0.25 0.25
## percent paid plenty shifts position fast
## 0.25 0.25 0.25
## possible still preferences fast products quickly
## 0.25 0.25 0.25
## quality bar quickly possible readily available
## 0.25 0.25 0.25
## requirements easy responsibility ownership results great
## 0.25 0.25 0.25
## results team rewarding people shifts everyones
## 0.25 0.25 0.25
## ship dock shipping products short amount
## 0.25 0.25 0.25
## short fantastic smart coworkers still maintaining
## 0.25 0.25 0.25
## success fail successful also team driven
## 0.25 0.25 0.25
## technology today things happening things lot
## 0.25 0.25 0.25
## time fast time go top line
## 0.25 0.25 0.25
## upfront experience vision well volume call
## 0.25 0.25 0.25
## well rewarded well tuition witnessed combined
## 0.25 0.25 0.25
## work can work cool work environments
## 0.25 0.25 0.25
## work fast work job workers readily
## 0.25 0.25 0.25
## yell leader
## 0.25
# DO NOT HAVE FILE all_goog_corp
# Created below
goog_df <- data.frame(pros=goog_pros, cons=goog_cons)
goog_df <- goog_df[complete.cases(goog_df), ]
str(goog_df)
## 'data.frame': 499 obs. of 2 variables:
## $ pros: Factor w/ 491 levels "- access to a vast wealth of technical resources and people",..: 20 354 485 12 409 227 412 375 308 383 ...
## $ cons: Factor w/ 489 levels "- bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow innovation and d"| __truncated__,..: 17 308 170 6 289 56 451 445 180 107 ...
goog_vec <- c(paste(goog_df$pros, collapse=" "),
paste(goog_df$cons, collapse=" ")
)
all_goog_corpus <- VCorpus(VectorSource(goog_vec))
# Create all_goog_corp
all_goog_corp <- tm_clean(all_goog_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_goog_corp)
# Name the columns of all_tdm
colnames(all_tdm) <- c("Goog_Pros", "Goog_Cons")
# Create all_m
all_m <- as.matrix(all_tdm)
# Build a comparison cloud
comparison.cloud(all_m, max.words=100, colors=c("#F44336", "#2196f3"))
# DO NOT HAVE - THIS IS THE ALL POSITIVE ASSOCIATIONS
# Created below
goog_p_tdm <- TermDocumentMatrix(goog_pros_corp, control=list(tokenize=tokenizer))
goog_p_tdm_m <- as.matrix(goog_p_tdm)
goog_p_freq <- rowSums(goog_p_tdm_m)
all_tdm_df <- merge(y=data.frame(keyWord=names(goog_p_freq), googNum=goog_p_freq, stringsAsFactors=FALSE),
x=data.frame(keyWord=names(amzn_p_freq), amznNum=amzn_p_freq, stringsAsFactors=FALSE),
by="keyWord", all=TRUE
)
all_tdm_df[is.na(all_tdm_df)] <- 0
all_tdm_m <- as.matrix(all_tdm_df[, -1])
rownames(all_tdm_m) <- all_tdm_df$keyWord
# Create common_words
common_words <- subset(all_tdm_m, all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 2] - common_words[, 1])
# Add difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing=TRUE), ]
# Create top15_df
top15_df <- data.frame(x=common_words[1:15, 1], y=common_words[1:15, 2], labels=rownames(common_words)[1:15])
# Create the pyramid plot
plotrix::pyramid.plot(top15_df$x, top15_df$y,
labels=top15_df$labels, gap = 12,
top.labels = c("Amzn", "Pro Words", "Google"),
main = "Words in Common", unit = NULL
)
## [1] 5.1 4.1 4.1 2.1
# DO NOT HAVE - THIS IS THE ALL NEGATIVE ASSOCIATIONS
# Created below
goog_c_tdm <- TermDocumentMatrix(goog_cons_corp, control=list(tokenize=tokenizer))
goog_c_tdm_m <- as.matrix(goog_c_tdm)
goog_c_freq <- rowSums(goog_c_tdm_m)
all_tdm_df <- merge(y=data.frame(keyWord=names(goog_c_freq), googNum=goog_c_freq, stringsAsFactors=FALSE),
x=data.frame(keyWord=names(amzn_c_freq), amznNum=amzn_c_freq, stringsAsFactors=FALSE),
by="keyWord", all=TRUE
)
all_tdm_df[is.na(all_tdm_df)] <- 0
all_tdm_m <- as.matrix(all_tdm_df[, -1])
rownames(all_tdm_m) <- all_tdm_df$keyWord
# Create common_words
common_words <- subset(all_tdm_m, all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 2] - common_words[, 1])
# Bind difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing=TRUE), ]
# Create top15_df
top15_df <- data.frame(x=common_words[1:15, 1], y=common_words[1:15, 2], labels=rownames(common_words)[1:15])
# Create the pyramid plot
plotrix::pyramid.plot(top15_df$x, top15_df$y,
labels=top15_df$labels, gap = 12,
top.labels = c("Amzn", "Cons Words", "Google"),
main = "Words in Common", unit = NULL
)
## [1] 4 2 4 2
R is a programming language, while RStudio is a company that created an IDE for R:
Install R and Rstudio - both are free downloads:
Rstudio panes - Console, Environment, File/Plot/Package/Help:
Source pane - good place to write multi-line code prior to running it in the console:
The View() function is the data viewer - just run it at the console, with the name of a frame inside:
Environment pane is in the upper-right corner, keeping track of the R session:
History tab is next to the environment tab, as part of the Environment pane:
Files pane is in the bottom-right corner of the default Rstudio layout:
Plots pane and packages tab - the lower-right series of panes:
Help pane displays the help pages for R objects:
Viewer tab is also in the lower-right pane, and shows html output (if any) produced during the session:
Coding features - when writing in the Source pane for a .R object in Rstudio, R-specific coding and extensions are applied:
Coding diagnostics - Rstudio flags potential errors in the code prior to the code running:
Keyboard shortcuts help save time while writing code in the Source pane:
Multiple cursors can be created within the editor using CTRL-ALT-Up/Down (or by CTRL-ALT-clicking at each desired position):
Navigate and edit code using SHIFT-ALT-G to jump to any line in the document:
Run scripts frequently to help with checking and debugging code:
Traceback helps to debug errors that occur when you run the code:
Debugger mode is a way of pausing time - run one line, then see how Rstudio sees the code and environment and variables at that specific moment:
Debugger mode: breakpoints can help find what is going wrong, even if the code is not throwing a formal error (wrong result, rather than code bombs out):
Rstudio project for navigating between projects:
Populating projects - assuming starting with File/New Project/New Directory/Empty Project:
Packrat allows for using different versions of a package for different projects (useful for reproducible research):
Introduction to R packages - best way to share functions, vignettes, and the like:
Import and load source files - can add files from the “Add” tab of the File/New Project/New Directory/R Package process:
Package documentation (Part I) - R documentation files have a special format and are saved as .Rd files:
Package documentation (Part II) - filling in the skeleton that roxygen has created:
Package documentation (Part III) - Build/More/Document:
Test packages (Part I) - make sure that all of the functions work, including cross-function dependencies:
Test packages (Part II) - create tests by saving a new script to the test/testthat directory:
Test packages (Part III) - use Build/Test Package from Environment/History tab (CTRL-SHIFT-T works also):
Check packages is an optional component of the package building process:
Build packages - R converts to a single compressed file (.tar.gz) which is known as a “tarball”:
Chapter 2 - Version Control
Introduction to Git (available in Rstudio, along with SVN, to help with collaboration and version control):
Stage and commit - using the Git tab in the Environment/History pane:
Using .gitignore - telling Git that certain files should not be flagged as having differences from the “official version”:
Git Icons - example:
Commit history - accessed through the “Commit” tab of the “History” window:
Undo committed changes: checkout (Git equivalent to the “Undo” button in some other softwares):
Undo committed changes - returning only to the previously committed file does not require a “checkout”:
Introduction to GitHub - the github.com website allows for keeping copies in the cloud, even as collaborator work off-line:
Pull and push - additional layer of complexity that github.com adds to Git:
Chapter 3 - Reporting
Tools for reporting - sharing results to a wider audience (clients, collaborators, etc.):
Introduction to R Markdown - creating all of the code for a reproducible research, plus all of the supporting text:
R Markdown in Rstudio - integration by way of the .Rmd in the source pane:
Rendering R Markdown - available through a GUI system in Rstudio:
Compile notebook - can convert any R Script document to R Markdown using File/Compile Notebook:
Rstudio LaTeX editor - common format used in math and science departments for reporting:
Shiny applications can easily be written, tested, and run using Rstudio:
Publish Shiny apps - place finalized apps on-line:
Robert Muenchen - author of “R for SAS and SPSS Users” and “R for Stata Users”:
Example code includes:
utils::demo("graphics") # nice example of plots and data
##
##
## demo(graphics)
## ---- ~~~~~~~~
##
## > # Copyright (C) 1997-2009 The R Core Team
## >
## > require(datasets)
##
## > require(grDevices); require(graphics)
##
## > ## Here is some code which illustrates some of the differences between
## > ## R and S graphics capabilities. Note that colors are generally specified
## > ## by a character string name (taken from the X11 rgb.txt file) and that line
## > ## textures are given similarly. The parameter "bg" sets the background
## > ## parameter for the plot and there is also an "fg" parameter which sets
## > ## the foreground color.
## >
## >
## > x <- stats::rnorm(50)
##
## > opar <- par(bg = "white")
##
## > plot(x, ann = FALSE, type = "n")
##
## > abline(h = 0, col = gray(.90))
##
## > lines(x, col = "green4", lty = "dotted")
##
## > points(x, bg = "limegreen", pch = 21)
##
## > title(main = "Simple Use of Color In a Plot",
## + xlab = "Just a Whisper of a Label",
## + col.main = "blue", col.lab = gray(.8),
## + cex.main = 1.2, cex.lab = 1.0, font.main = 4, font.lab = 3)
##
## > ## A little color wheel. This code just plots equally spaced hues in
## > ## a pie chart. If you have a cheap SVGA monitor (like me) you will
## > ## probably find that numerically equispaced does not mean visually
## > ## equispaced. On my display at home, these colors tend to cluster at
## > ## the RGB primaries. On the other hand on the SGI Indy at work the
## > ## effect is near perfect.
## >
## > par(bg = "gray")
##
## > pie(rep(1,24), col = rainbow(24), radius = 0.9)
##
## > title(main = "A Sample Color Wheel", cex.main = 1.4, font.main = 3)
##
## > title(xlab = "(Use this as a test of monitor linearity)",
## + cex.lab = 0.8, font.lab = 3)
##
## > ## We have already confessed to having these. This is just showing off X11
## > ## color names (and the example (from the postscript manual) is pretty "cute".
## >
## > pie.sales <- c(0.12, 0.3, 0.26, 0.16, 0.04, 0.12)
##
## > names(pie.sales) <- c("Blueberry", "Cherry",
## + "Apple", "Boston Cream", "Other", "Vanilla Cream")
##
## > pie(pie.sales,
## + col = c("purple","violetred1","green3","cornsilk","cyan","white"))
##
## > title(main = "January Pie Sales", cex.main = 1.8, font.main = 1)
##
## > title(xlab = "(Don't try this at home kids)", cex.lab = 0.8, font.lab = 3)
##
## > ## Boxplots: I couldn't resist the capability for filling the "box".
## > ## The use of color seems like a useful addition, it focuses attention
## > ## on the central bulk of the data.
## >
## > par(bg="cornsilk")
##
## > n <- 10
##
## > g <- gl(n, 100, n*100)
##
## > x <- rnorm(n*100) + sqrt(as.numeric(g))
##
## > boxplot(split(x,g), col="lavender", notch=TRUE)
##
## > title(main="Notched Boxplots", xlab="Group", font.main=4, font.lab=1)
##
## > ## An example showing how to fill between curves.
## >
## > par(bg="white")
##
## > n <- 100
##
## > x <- c(0,cumsum(rnorm(n)))
##
## > y <- c(0,cumsum(rnorm(n)))
##
## > xx <- c(0:n, n:0)
##
## > yy <- c(x, rev(y))
##
## > plot(xx, yy, type="n", xlab="Time", ylab="Distance")
##
## > polygon(xx, yy, col="gray")
##
## > title("Distance Between Brownian Motions")
##
## > ## Colored plot margins, axis labels and titles. You do need to be
## > ## careful with these kinds of effects. It's easy to go completely
## > ## over the top and you can end up with your lunch all over the keyboard.
## > ## On the other hand, my market research clients love it.
## >
## > x <- c(0.00, 0.40, 0.86, 0.85, 0.69, 0.48, 0.54, 1.09, 1.11, 1.73, 2.05, 2.02)
##
## > par(bg="lightgray")
##
## > plot(x, type="n", axes=FALSE, ann=FALSE)
##
## > usr <- par("usr")
##
## > rect(usr[1], usr[3], usr[2], usr[4], col="cornsilk", border="black")
##
## > lines(x, col="blue")
##
## > points(x, pch=21, bg="lightcyan", cex=1.25)
##
## > axis(2, col.axis="blue", las=1)
##
## > axis(1, at=1:12, lab=month.abb, col.axis="blue")
##
## > box()
##
## > title(main= "The Level of Interest in R", font.main=4, col.main="red")
##
## > title(xlab= "1996", col.lab="red")
##
## > ## A filled histogram, showing how to change the font used for the
## > ## main title without changing the other annotation.
## >
## > par(bg="cornsilk")
##
## > x <- rnorm(1000)
##
## > hist(x, xlim=range(-4, 4, x), col="lavender", main="")
##
## > title(main="1000 Normal Random Variates", font.main=3)
##
## > ## A scatterplot matrix
## > ## The good old Iris data (yet again)
## >
## > pairs(iris[1:4], main="Edgar Anderson's Iris Data", font.main=4, pch=19)
##
## > pairs(iris[1:4], main="Edgar Anderson's Iris Data", pch=21,
## + bg = c("red", "green3", "blue")[unclass(iris$Species)])
##
## > ## Contour plotting
## > ## This produces a topographic map of one of Auckland's many volcanic "peaks".
## >
## > x <- 10*1:nrow(volcano)
##
## > y <- 10*1:ncol(volcano)
##
## > lev <- pretty(range(volcano), 10)
##
## > par(bg = "lightcyan")
##
## > pin <- par("pin")
##
## > xdelta <- diff(range(x))
##
## > ydelta <- diff(range(y))
##
## > xscale <- pin[1]/xdelta
##
## > yscale <- pin[2]/ydelta
##
## > scale <- min(xscale, yscale)
##
## > xadd <- 0.5*(pin[1]/scale - xdelta)
##
## > yadd <- 0.5*(pin[2]/scale - ydelta)
##
## > plot(numeric(0), numeric(0),
## + xlim = range(x)+c(-1,1)*xadd, ylim = range(y)+c(-1,1)*yadd,
## + type = "n", ann = FALSE)
##
## > usr <- par("usr")
##
## > rect(usr[1], usr[3], usr[2], usr[4], col="green3")
##
## > contour(x, y, volcano, levels = lev, col="yellow", lty="solid", add=TRUE)
##
## > box()
##
## > title("A Topographic Map of Maunga Whau", font= 4)
##
## > title(xlab = "Meters North", ylab = "Meters West", font= 3)
##
## > mtext("10 Meter Contour Spacing", side=3, line=0.35, outer=FALSE,
## + at = mean(par("usr")[1:2]), cex=0.7, font=3)
##
## > ## Conditioning plots
## >
## > par(bg="cornsilk")
##
## > coplot(lat ~ long | depth, data = quakes, pch = 21, bg = "green3")
##
## > par(opar)
Chapter 2 - Installing and Maintaining R
Installation typically includes both R (www.r-project.org) and R Studio (rstudio.com):
Chapter 3 - Help and Documentation
R Help can be accessed in several ways - help(myFunction) or ?myFunction or ??myFunction:
Chapter 4 - R Studio Basics
R Studio typically has four windows/consoles to work in:
Chapter 5 - Programming Language Basics
Programming Language Basics - R is an Object Oriented Language:
Parentheses and Braces:
Chapter 6 - Data Structures
Introduction to data structures - R has vectors, factors, data frames, arrays, lists, etc., and not just “the data set”:
Obtaining information from vectors:
Factors (categorical variables) and labels:
Data Frames are the closest equivalent to the dataset in other languages:
Matrices and lists:
Example code includes:
# Gender responses recorded as single-letter codes; one respondent is unknown.
gender <- c("f", "f", "f", NA, "m", "m", "m", "m")
# Recode the letters to a factor with readable labels, then print the result.
gender <- factor(gender, levels = c("f", "m"), labels = c("Female", "Male"))
gender
## [1] Female Female Female <NA> Male Male Male Male
## Levels: Female Male
# Question-1 scores for the same eight respondents.
q1 <- c(1, 2, 2, 3, 4, 5, 5, 4)
# Logical subscripting: keep only the scores given by the females.
# The NA gender yields an NA element in the result.
q1[gender == "Female"]
## [1] 1 2 2 NA
# Our data so far:
# Country codes alternate 1/2 across the eight subjects.
country <- rep(c(1, 2), times = 4)
# Period: "bc" before, "ac" after, with one missing value.
period <- c(rep("bc", 3), NA, rep("ac", 4))
# Business hours quarter 1, 2, 3 and 4
QR1 <- c(36, 34, 37, 35, 33, 32, 35, 31)
QR2 <- c(37, 35, 38, 36, 35, 33, 35, 33)
QR3 <- c(39, 37, 40, NA, 36, 35, 37, 35)
QR4 <- c(36, 34, 37, 35, 34, 32, 36, 32)
# Collect everything into a data frame; keep period as character, not factor.
company_data <- data.frame(country, period, QR1, QR2, QR3, QR4, stringsAsFactors = FALSE)
# Print the data frame
company_data
## country period QR1 QR2 QR3 QR4
## 1 1 bc 36 37 39 36
## 2 2 bc 34 35 37 34
## 3 1 bc 37 38 40 37
## 4 2 <NA> 35 36 NA 35
## 5 1 ac 33 35 36 34
## 6 2 ac 32 33 35 32
## 7 1 ac 35 35 37 36
## 8 2 ac 31 33 35 32
# The same 32 values as an unnamed 8x4 matrix (column-major fill).
mymatrix <- matrix(
  c(36, 34, 37, 35, 33, 32, 35, 31,
    37, 35, 38, 36, 35, 33, 35, 33,
    39, 37, 40, NA, 36, 35, 37, 35,
    36, 34, 37, 35, 34, 32, 36, 32),
  nrow = 8, ncol = 4
)
# cbind() builds the identical matrix from the quarterly vectors and keeps
# their names as column names.
same_matrix <- cbind(QR1, QR2, QR3, QR4)
# Pairwise deletion: each correlation uses every row where BOTH columns are
# non-missing, so the single NA in QR3 does not poison the whole matrix.
cor(same_matrix, use = "pairwise")
## QR1 QR2 QR3 QR4
## QR1 1.0000000 0.9531986 0.9669876 0.9686649
## QR2 0.9531986 1.0000000 0.9803486 0.9244735
## QR3 0.9669876 0.9803486 1.0000000 0.9193967
## QR4 0.9686649 0.9244735 0.9193967 1.0000000
Chapter 7 - Managing Files and Workspace
Manipulating objects - most languages require operating system commands:
Managing workspace - the working directory is where files will be read/written by default:
Example code includes:
# List all objects that are stored in the workspace.
# NOTE(review): ls() reports the calling environment, so the long listing
# below is a snapshot of the author's session at render time and is not
# reproducible from this document alone.
ls()
## [1] "a_data_frame" "a_factor"
## [3] "a_fancy_microwave" "a_high_end_microwave"
## [5] "a_linear_model" "a_microwave_oven"
## [7] "a_numeric_vector" "acc"
## [9] "acc_full" "acc_g"
## [11] "acc_i" "acc_small"
## [13] "accs" "ads"
## [15] "airquality" "all_chardonnay"
## [17] "all_clean" "all_coffee"
## [19] "all_cols" "all_corpus"
## [21] "all_goog_corp" "all_goog_corpus"
## [23] "all_m" "all_tdm"
## [25] "all_tdm_df" "all_tdm_m"
## [27] "all_tweets" "amzn"
## [29] "amzn_c_freq" "amzn_c_tdm"
## [31] "amzn_c_tdm_m" "amzn_c_tdm2"
## [33] "amzn_cons" "amzn_cons_corp"
## [35] "amzn_p_freq" "amzn_p_m"
## [37] "amzn_p_tdm" "amzn_p_tdm_m"
## [39] "amzn_pros" "amzn_pros_corp"
## [41] "another_microwave_oven" "ascii_pizza_slice"
## [43] "assigned_microwave_oven" "associations"
## [45] "associations_df" "atmos"
## [47] "az_c_corp" "az_p_corp"
## [49] "bbbDescr" "bi_words"
## [51] "bigram_dtm" "bigram_dtm_m"
## [53] "blackChess" "bloodbrain_x"
## [55] "bloodbrain_x_small" "bloodbrain_y"
## [57] "Boston" "BostonHousing"
## [59] "breast_cancer_x" "breast_cancer_y"
## [61] "cars" "cgdp"
## [63] "cgdp_afg" "chardonnay_corp"
## [65] "chardonnay_freqs" "chardonnay_m"
## [67] "chardonnay_source" "chardonnay_tdm"
## [69] "chardonnay_tweets" "chardonnay_words"
## [71] "chess" "choco_data"
## [73] "churn_x" "churn_y"
## [75] "churnTest" "churnTrain"
## [77] "clean_chardonnay" "clean_corp"
## [79] "clean_corpus" "cloned_microwave_oven"
## [81] "coffee_corpus" "coffee_dtm"
## [83] "coffee_m" "coffee_source"
## [85] "coffee_tdm" "coffee_tweets"
## [87] "common_words" "comp"
## [89] "comp_dict" "compA"
## [91] "company_data" "compB"
## [93] "complete_text" "complicate"
## [95] "conf" "conf_full"
## [97] "conf_g" "conf_i"
## [99] "conf_small" "country"
## [101] "crime_data" "crime_data_sc"
## [103] "crime_km" "crime_single"
## [105] "curCol" "curMeans"
## [107] "curRow" "curSD"
## [109] "custom_reader" "cylSplit"
## [111] "data.dist" "data.scaled"
## [113] "days" "desMeans"
## [115] "desSD" "df_corpus"
## [117] "df_source" "diagnosis"
## [119] "diamonds" "difference"
## [121] "dist_matrix" "dist_rain"
## [123] "do_math" "draw_roc_lines"
## [125] "dunn_complete" "dunn_km"
## [127] "dunn_km_sc" "dunn_single"
## [129] "e" "eachState"
## [131] "emails" "emails_full"
## [133] "emails_small" "env"
## [135] "env_microwave_oven_factory" "env2"
## [137] "error" "example_kelvin"
## [139] "example_text" "fancy_microwave_oven_factory"
## [141] "fancy_microwave_power_rating" "FN"
## [143] "foo" "FP"
## [145] "freq" "frequency"
## [147] "frequency2" "funDummy"
## [149] "future_days" "g"
## [151] "gender" "get_n_elements"
## [153] "get_n_elements.data.frame" "get_n_elements.default"
## [155] "goog" "goog_c_corp"
## [157] "goog_c_freq" "goog_c_tdm"
## [159] "goog_c_tdm_m" "goog_cons"
## [161] "goog_cons_corp" "goog_df"
## [163] "goog_p_corp" "goog_p_freq"
## [165] "goog_p_tdm" "goog_p_tdm_m"
## [167] "goog_pros" "goog_pros_corp"
## [169] "goog_vec" "hc"
## [171] "hcd" "hclust.average"
## [173] "hclust.complete" "hclust.out"
## [175] "hclust.pokemon" "hclust.single"
## [177] "high_end_microwave_oven_factory" "i"
## [179] "idxTrain" "indices"
## [181] "intCtr" "inv"
## [183] "iris" "k"
## [185] "kang_nose" "keyIdx"
## [187] "keyNames" "keyStateNames"
## [189] "keyStateNums" "kitty"
## [191] "km.out" "km_cars"
## [193] "km_seeds" "kmeans_iris"
## [195] "knn_test" "knn_train"
## [197] "last_5" "lastChristmasNoon"
## [199] "lev" "linkedin"
## [201] "linkedin_lm" "linkedin_pred"
## [203] "listA" "lm_choco"
## [205] "lm_kang" "lm_shop"
## [207] "lm_wage" "lm_wb"
## [209] "lm_wb_log" "logBBB"
## [211] "lst" "lst2"
## [213] "max_age" "max_class"
## [215] "me" "means"
## [217] "memb_complete" "memb_single"
## [219] "microwave_oven" "microwave_oven_factory"
## [221] "microwave_power_rating" "min_age"
## [223] "min_class" "mod"
## [225] "mod2" "mod3"
## [227] "model" "model_glmnet"
## [229] "model_list" "model_rf"
## [231] "model1" "model2"
## [233] "mpgRange" "mpgScale"
## [235] "mtcars" "mtxTest"
## [237] "my_class" "my_iris"
## [239] "my_knn" "myControl"
## [241] "myFolds" "mymatrix"
## [243] "myVal" "myWords"
## [245] "n" "n_elements_ability.cov"
## [247] "n_elements_sleep" "n_smart"
## [249] "new_stops" "new_text"
## [251] "nextUMHomeGame" "nms"
## [253] "nose_length_est" "nose_width_new"
## [255] "nRed" "nWhite"
## [257] "opar" "other_199"
## [259] "p" "p_class"
## [261] "period" "pie.sales"
## [263] "pin" "pokemon"
## [265] "pokemon.scaled" "pokeTotal"
## [267] "poss_log10" "pr.out"
## [269] "pr.var" "pr.with.scaling"
## [271] "pr.without.scaling" "prec"
## [273] "pred" "pred_full"
## [275] "pred_g" "pred_i"
## [277] "pretty_titles" "prevData"
## [279] "previous_4" "prop_less"
## [281] "pruned" "purple_orange"
## [283] "pve" "q1"
## [285] "qdap_clean" "QR1"
## [287] "QR2" "QR3"
## [289] "QR4" "r_sq"
## [291] "rain" "range"
## [293] "ranks" "ratio_ss"
## [295] "rawTweets" "rec"
## [297] "redWine" "remove_cols"
## [299] "res" "res_test"
## [301] "resamps" "rightNow"
## [303] "rmse" "rmse_test"
## [305] "rmse_train" "rows"
## [307] "run_complete" "run_dist"
## [309] "run_km" "run_km_sc"
## [311] "run_record" "run_record_sc"
## [313] "run_single" "safe_log10"
## [315] "sales" "same_matrix"
## [317] "sampMsg" "scale"
## [319] "school_km" "school_result"
## [321] "seeds" "seeds_km"
## [323] "seeds_km_1" "seeds_km_2"
## [325] "seeds_type" "shop_data"
## [327] "shop_new" "shuffled"
## [329] "size_dist" "some_vars"
## [331] "Sonar" "spam"
## [333] "spam_classifier" "spam_pred"
## [335] "species" "split"
## [337] "sq_ft" "ss_res"
## [339] "ss_tot" "stateCols"
## [341] "stateDF" "stem_doc"
## [343] "tdm_df" "tdm_m"
## [345] "tdm1" "tdm2"
## [347] "term_count" "term_frequency"
## [349] "test" "test_labels"
## [351] "test_output" "test_output_knn"
## [353] "test_output_lm" "test_output_lm_log"
## [355] "text" "text_corp"
## [357] "tf_tdm" "tf_tdm_m"
## [359] "tfidf_tdm" "tfidf_tdm_m"
## [361] "titanic" "titanic_train"
## [363] "tm_clean" "TN"
## [365] "tokenizer" "top_grades"
## [367] "top15_df" "top25_df"
## [369] "TP" "train"
## [371] "train_indices" "train_labels"
## [373] "tree" "tree_g"
## [375] "tree_i" "trIdx"
## [377] "tweets_dist" "tweets_tdm"
## [379] "tweets_tdm2" "type_info"
## [381] "unigram_dtm" "unseen"
## [383] "urb_pop" "url"
## [385] "usr" "v1"
## [387] "v2" "v3"
## [389] "v4" "v5"
## [391] "v6" "vec_corpus"
## [393] "vec_source" "Wage"
## [395] "what_am_i" "what_am_i.cat"
## [397] "what_am_i.character" "what_am_i.mammal"
## [399] "whiteChess" "whiteWine"
## [401] "wine" "wisc.data"
## [403] "wisc.df" "wisc.hclust"
## [405] "wisc.hclust.clusters" "wisc.km"
## [407] "wisc.pr" "wisc.pr.hclust"
## [409] "wisc.pr.hclust.clusters" "word_freqs"
## [411] "world_bank_test" "world_bank_test_input"
## [413] "world_bank_test_output" "world_bank_test_truth"
## [415] "world_bank_train" "worst_grades"
## [417] "wss" "x"
## [419] "xadd" "xdelta"
## [421] "xFactorNon" "xFactorOrder"
## [423] "xRaw" "xscale"
## [425] "xx" "y"
## [427] "yadd" "ydelta"
## [429] "year" "yscale"
## [431] "yy"
# List all objects in the workspace with a "q" in their name.
# 'pattern' is a regular expression matched anywhere in the object name,
# so this also catches names like "airquality" and "sq_ft".
ls(pattern = "q")
## [1] "airquality" "amzn_c_freq" "amzn_p_freq"
## [4] "chardonnay_freqs" "freq" "frequency"
## [7] "frequency2" "goog_c_freq" "goog_p_freq"
## [10] "q1" "qdap_clean" "r_sq"
## [13] "sq_ft" "term_frequency" "word_freqs"
# The workshop and businesshours data frames are already loaded in your workspace
businesshours <- company_data
# Build the workshop survey data frame.
# NOTE(review): stringsAsFactors = TRUE is now explicit. Since R 4.0.0 the
# default is FALSE, which would make 'gender' a character column and no
# longer match the "Factor w/ 2 levels" shown by str() below.
workshop <- data.frame(workshop=c(1, 2, 1, 2, 1, 2, 1, 2),
gender=c("f", "f", "f", NA, "m", "m", "m", "m"),
q1=c(1, 2, 2, 3, 4, 5, 5, 4), q2=c(1, 1, 2, 1, 5, 4, 3, 5),
q3=c(5, 4, 4, NA, 2, 5, 4, 5), q4=c(1, 1, 3, 3, 4, 5, 4, 5),
stringsAsFactors=TRUE
)
# Have a look at the first three rows of the `workshop` data frame.
head(workshop, n=3)
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
# Have a look at the structure of the `workshop` data frame.
str(workshop)
## 'data.frame': 8 obs. of 6 variables:
## $ workshop: num 1 2 1 2 1 2 1 2
## $ gender : Factor w/ 2 levels "f","m": 1 1 1 NA 2 2 2 2
## $ q1 : num 1 2 2 3 4 5 5 4
## $ q2 : num 1 1 2 1 5 4 3 5
## $ q3 : num 5 4 4 NA 2 5 4 5
## $ q4 : num 1 1 3 3 4 5 4 5
# Have a look at the last four rows of the `businesshours` data frame.
tail(businesshours, n=4)
## country period QR1 QR2 QR3 QR4
## 5 1 ac 33 35 36 34
## 6 2 ac 32 33 35 32
## 7 1 ac 35 35 37 36
## 8 2 ac 31 33 35 32
# Have a look at the attributes of the `businesshours` data frame.
attributes(businesshours)
## $names
## [1] "country" "period" "QR1" "QR2" "QR3" "QR4"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8
##
## $class
## [1] "data.frame"
# Assign the objects with the character q in their name to a variable 'objects_with_q'
objects_with_q <- ls(pattern = "q")
# Remove the objects (print them instead)
print(objects_with_q)
## [1] "airquality" "amzn_c_freq" "amzn_p_freq"
## [4] "chardonnay_freqs" "freq" "frequency"
## [7] "frequency2" "goog_c_freq" "goog_p_freq"
## [10] "q1" "qdap_clean" "r_sq"
## [13] "sq_ft" "term_frequency" "word_freqs"
# Left commented on purpose: running it would really delete these objects.
# rm(list = objects_with_q)
Chapter 8 - Controlling Functions
Functions and Arguments - R is controlled by functions that are called with values passed to arguments:
Classes - generic functions offer different methods for each class of objects:
Example code includes:
# Quarterly business-hours vectors; QR3 contains one missing value.
QR1 <- c(36, 34, 37, 35, 33, 32, 35, 31)
QR2 <- c(37, 35, 38, 36, 35, 33, 35, 33)
QR3 <- c(39, 37, 40, NA, 36, 35, 37, 35)
# summary() expects a single object, so wrap the three vectors in a data frame.
summary(data.frame(QR1, QR2, QR3))
## QR1 QR2 QR3
## Min. :31.00 Min. :33.00 Min. :35.0
## 1st Qu.:32.75 1st Qu.:34.50 1st Qu.:35.5
## Median :34.50 Median :35.00 Median :37.0
## Mean :34.12 Mean :35.25 Mean :37.0
## 3rd Qu.:35.25 3rd Qu.:36.25 3rd Qu.:38.0
## Max. :37.00 Max. :38.00 Max. :40.0
## NA's :1
# Verify the class of QR1
class(QR1)
## [1] "numeric"
# Coerce a copy of QR1 to character and confirm the new class.
QR1_char <- as.character(QR1)
class(QR1_char)
## [1] "character"
QR4 <- c(36, 34, 37, 35, 34, 32, 36, 32)
# Largest quarterly mean. The result is NA because mean(QR3) is NA
# (na.rm defaults to FALSE), and max() of anything containing NA is NA.
max(vapply(list(QR1, QR2, QR3, QR4), mean, numeric(1)))
## [1] NA
# Smallest quarterly maximum -- NA again, for the same reason (max(QR3) is NA).
min(vapply(list(QR1, QR2, QR3, QR4), max, numeric(1)))
## [1] NA
# Element-wise sum of the four quarters, logged; na.rm=TRUE drops the one
# row whose sum is NA before the quantiles are computed.
quantile(log(QR1 + QR2 + QR3 + QR4), na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 4.875197 4.905028 4.941642 4.980028 5.023881
Chapter 9 - Data Acquisition
Loading CSV file - from local or from URL:
Loading other data - for example, tab-delimited files (TSV):
Example code includes:
# Load the workshop data and assign them to the variable below
# strip.white=TRUE trims padding inside unquoted fields; na.strings="" then
# turns the emptied cells into NA. NOTE(review): requires network access to
# the bit.ly redirect at run time.
mydata <-read.csv("http://bit.ly/bob_mydata_csv", strip.white=TRUE, na.strings="")
# Print mydata
mydata
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
## 4 2 <NA> 3 1 NA 3
## 5 1 m 4 5 2 4
## 6 2 m 5 4 5 5
## 7 1 m 5 3 4 4
## 8 2 m 4 5 5 5
# Load the library
# NOTE(review): sas7bdat is a third-party CRAN package, not part of base R.
library(sas7bdat)
# Load the workshop tab file with the right arguments and assign them to the variable 'mydata_tab'
mydata_tab <- read.delim("http://bit.ly/bob_mydata_tab", strip.white=TRUE, na.strings="")
# Load the workshop SAS file and assign them to the variable 'mydata_sas'
mydata_sas <- read.sas7bdat("http://bit.ly/bob_mydata_sas7bdat")
# Print both variables
mydata_tab
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
## 4 2 <NA> 3 1 NA 3
## 5 1 m 4 5 2 4
## 6 2 m 5 4 5 5
## 7 1 m 5 3 4 4
## 8 2 m 4 5 5 5
# Note that the SAS file carries an extra 'id' column and SAS-style missing
# codes ('.' and NaN) rather than R's NA, and its last q4 value differs (9).
mydata_sas
## id workshop gender q1 q2 q3 q4
## 1 1 1 f 1 1 5 1
## 2 2 2 f 2 1 4 1
## 3 3 1 f 2 2 4 3
## 4 4 2 . 3 1 NaN 3
## 5 5 1 m 4 5 2 4
## 6 6 2 m 5 4 5 5
## 7 7 1 m 5 3 4 4
## 8 8 2 m 4 5 5 9
# The workshop data as a string
# NOTE(review): the header names only 6 columns while every data row has 7
# fields; read.csv therefore uses the first field of each row as row names
# (the implicit row-names rule documented in ?read.table).
mystring <- "workshop,gender,q1,q2,q3,q4
1,1,f,1,1,5,1
2,2,f,2,1,4,1
3,1,f,2,2,4,3
4,2, ,3,1, ,3
5,1,m,4,5,2,4
6,2,m,5,4,5,5
7,1,m,5,3,4,4
8,2,m,4,5,5,5"
# Read the workshop from the string and assign it to the variable below
# textConnection() lets read.csv consume the in-memory string as if it were
# a file; the " " cells become NA via strip.white + na.strings="".
mydata <- read.csv(textConnection(mystring), strip.white=TRUE, na.strings="")
# Print mydata
mydata
## workshop gender q1 q2 q3 q4
## 1 1 f 1 1 5 1
## 2 2 f 2 1 4 1
## 3 1 f 2 2 4 3
## 4 2 <NA> 3 1 NA 3
## 5 1 m 4 5 2 4
## 6 2 m 5 4 5 5
## 7 1 m 5 3 4 4
## 8 2 m 4 5 5 5
Chapter 10 - Missing Values
Missing value codes include NA (not available) and NaN (not a number):
Dealing with missing values:
Example code includes:
# QR3 business hours; position 4 is missing.
QR3 <- c(39, 37, 40, NA, 36, 35, 37, 35)
# Helper: count how many elements of x are missing.
n.missing <- function(x) length(which(is.na(x)))
# Count the missing values in QR3 and show the result.
missing_count <- n.missing(QR3)
missing_count
## [1] 1
# The vector random_vector is preloaded in the workspace.
random_vector <- rep(c(1, 2, 3), times = 5)
# Recode every 3 in random_vector as missing.
random_vector <- replace(random_vector, random_vector == 3, NA)
# Print the new vector
random_vector
## [1] 1 2 NA 1 2 NA 1 2 NA 1 2 NA 1 2 NA
QR1 <- c(36, 34, 37, 35, 33, 32, 35, NA)
# 32 values filled column-by-column into an 8x4 data frame of quarters.
tempData <- c(36, 34, 37, 35, 33, 32, 35, NA, 37, 35, 38, 36, 35, 33, 35, 33, 39, 37, 40, NA, 36, 35, NA, 35, 36, NA, 37, 35, 34, 32, 36, 32)
my_QR_data <- setNames(as.data.frame(matrix(tempData, ncol = 4)), c("QR1", "QR2", "QR3", "QR4"))
# Print the vector 'QR1' and inspect it
QR1
## [1] 36 34 37 35 33 32 35 NA
# Print the data frame 'my_QR_data' and inspect it
my_QR_data
## QR1 QR2 QR3 QR4
## 1 36 37 39 36
## 2 34 35 37 NA
## 3 37 38 40 37
## 4 35 36 NA 35
## 5 33 35 36 34
## 6 32 33 35 32
## 7 35 35 NA 36
## 8 NA 33 35 32
# Drop the NA before averaging; without na.rm the mean itself would be NA.
mean(QR1, na.rm = TRUE)
## [1] 34.57143
# Listwise deletion: keep only the rows with no missing value in any column.
na.omit(my_QR_data)
## QR1 QR2 QR3 QR4
## 1 36 37 39 36
## 3 37 38 40 37
## 5 33 35 36 34
## 6 32 33 35 32
Chapter 11 - Selecting variables
Selecting Variables (1) - selecting variables from data frames is different in R and other statistical packages:
Selecting Variables (2) - can also be done through subsetting or indexing:
dplyr Package - simplifying variable selection using library(dplyr):
Example code includes:
# 32 quarterly business-hours values, filled column-by-column into 8 rows x 4 quarters.
tempData <- c(36, 34, 37, 35, 33, 32, 35, 33, 37, 35, 38, 36, 35, 33, 35, 33, 39, 37, 40, 39, 36, 35, 36, 35, 36, 33, 37, 35, 34, 32, 36, 32)
businesshours <- as.data.frame(matrix(tempData, ncol=4))
names(businesshours) <- c("QR1", "QR2", "QR3", "QR4")
# --- Approach 1: the $ operator ---
# Select the QR1 variable of businesshours
my_QR1_selection <- businesshours$QR1
# Make a summary of the variables QR2 and QR3 of the data frame businesshours.
summary(data.frame(businesshours$QR2, businesshours$QR3))
## businesshours.QR2 businesshours.QR3
## Min. :33.00 Min. :35.00
## 1st Qu.:34.50 1st Qu.:35.75
## Median :35.00 Median :36.50
## Mean :35.25 Mean :37.12
## 3rd Qu.:36.25 3rd Qu.:39.00
## Max. :38.00 Max. :40.00
# --- Approach 2: attach()/detach() (discouraged) ---
# Attach the businesshours variable to the temporary work area.
attach(businesshours)
## The following objects are masked _by_ .GlobalEnv:
##
## QR1, QR2, QR3, QR4
# NOTE(review): the masking message above means the GLOBAL vectors QR1..QR4
# (created in earlier chapters, with an NA in QR3) shadow the attached
# columns. The two statements below therefore read the global vectors, not
# businesshours -- which is why this summary reports "NA's :1" even though
# businesshours$QR3 has no missing values. This silent shadowing is exactly
# why attach() is considered unsafe.
# Select the QR1 variable of businesshours and assign it to my_QR1_selection.
my_QR1_selection <- QR1
# Make a summary of the variables QR2 and QR3 of the data frame businesshours.
summary(data.frame(QR2, QR3))
## QR2 QR3
## Min. :33.00 Min. :35.0
## 1st Qu.:34.50 1st Qu.:35.5
## Median :35.00 Median :37.0
## Mean :35.25 Mean :37.0
## 3rd Qu.:36.25 3rd Qu.:38.0
## Max. :38.00 Max. :40.0
## NA's :1
# Detach the businesshours variable of the temporary work area.
detach(businesshours)
# --- Approach 3: with() scopes the lookup without touching the search path ---
# Select the QR1 variable of businesshours using the with function and assign it my_QR1_selection
my_QR1_selection <- with(businesshours, QR1)
# Make a summary of the variables QR2 and QR3 of the data frame businesshours by using the with function.
summary(with(businesshours, data.frame(QR2, QR3)))
## QR2 QR3
## Min. :33.00 Min. :35.00
## 1st Qu.:34.50 1st Qu.:35.75
## Median :35.00 Median :36.50
## Mean :35.25 Mean :37.12
## 3rd Qu.:36.25 3rd Qu.:39.00
## Max. :38.00 Max. :40.00
# --- Approach 4: [row, column] subscripting by name ---
# Select the QR1 variable of businesshours
my_QR1_selection <- businesshours[, "QR1"]
# Make a summary of the variables QR2 and QR3 of businesshours.
summary(businesshours[, c("QR2", "QR3")])
## QR2 QR3
## Min. :33.00 Min. :35.00
## 1st Qu.:34.50 1st Qu.:35.75
## Median :35.00 Median :36.50
## Mean :35.25 Mean :37.12
## 3rd Qu.:36.25 3rd Qu.:39.00
## Max. :38.00 Max. :40.00
# Add grouping variables for the modeling and dplyr examples below.
businesshours$country <- c(1, 2, 1, 2, 1, 2, 1, 2)
businesshours$period <- c("bc", "bc", "bc", "bc", "ab", "ab", "ab", "ab")
# t-test of QR4 as function of period and assign it to t_test_1.
t_test_1 <- t.test(QR4 ~ period, data = businesshours)
# A paired t-test comparing QR1 and QR2 and assign it to t_test_2.
t_test_2 <- with(businesshours, t.test(QR1, QR2, paired=TRUE))
# --- Approach 5: dplyr::select() and its helpers ---
# Load the dplyr package into the memory.
library(dplyr)
# Use the select() function to select all variables starting with the variable "period" until "QR3" and all the variables in between them.
# NOTE(review): the printed column order below (period, country, QR4, QR3)
# implies the rendered session's businesshours had a different column order
# than the one rebuilt above -- confirm against the original source document.
select(businesshours, period:QR3)
## period country QR4 QR3
## 1 bc 1 36 39
## 2 bc 2 33 37
## 3 bc 1 37 40
## 4 bc 2 35 39
## 5 ab 1 34 36
## 6 ab 2 32 35
## 7 ab 1 36 36
## 8 ab 2 32 35
# Use the select() function to select all variables that contain "o".
# The explicit dplyr:: namespace avoids clashes with other contains() helpers.
select(businesshours, dplyr::contains("o"))
## country period
## 1 1 bc
## 2 2 bc
## 3 1 bc
## 4 2 bc
## 5 1 ab
## 6 2 ab
## 7 1 ab
## 8 2 ab
# Use the select() function to select all variables that starts_with "Q".
select(businesshours, starts_with("Q"))
## QR1 QR2 QR3 QR4
## 1 36 37 39 36
## 2 34 35 37 33
## 3 37 38 40 37
## 4 35 36 39 35
## 5 33 35 36 34
## 6 32 33 35 32
## 7 35 35 36 36
## 8 33 33 35 32
# Use the `select()` function to select all variables with a numeric range from 2 to 4 and starting with "QR".
select(businesshours, num_range("QR", 2:4))
## QR2 QR3 QR4
## 1 37 39 36
## 2 35 37 33
## 3 38 40 37
## 4 36 39 35
## 5 35 36 34
## 6 33 35 32
## 7 35 36 36
## 8 33 35 32
# Use the `select()` function to select all variables that DO NOT have a numeric range from 2 to 4 and starts with "QR".
# A leading minus inside select() drops the matched columns instead of keeping them.
select(businesshours, -num_range("QR", 2:4))
## QR1 country period
## 1 36 1 bc
## 2 34 2 bc
## 3 37 1 bc
## 4 35 2 bc
## 5 33 1 ab
## 6 32 2 ab
## 7 35 1 ab
## 8 33 2 ab
# Make a summary of QR1 and QR2 by nesting the select() function.
summary(select(businesshours, QR1, QR2))
## QR1 QR2
## Min. :32.00 Min. :33.00
## 1st Qu.:33.00 1st Qu.:34.50
## Median :34.50 Median :35.00
## Mean :34.38 Mean :35.25
## 3rd Qu.:35.25 3rd Qu.:36.25
## Max. :37.00 Max. :38.00
# Calculate the mean of QR3 with the mean() function.
mean(businesshours$QR3)
## [1] 37.125
Chapter 12 - Selecting Observations
Selecting observations from data frames using two main techniques:
Logic rules and functions:
Example code includes:
# --- Base R: logical subscripting with [rows, ] ---
# Select the observations of businesshours from the period before the crisis ("bc").
businesshours[businesshours$period == "bc", ]
## QR1 QR2 QR3 QR4 country period
## 1 36 37 39 36 1 bc
## 2 34 35 37 33 2 bc
## 3 37 38 40 37 1 bc
## 4 35 36 39 35 2 bc
# Select the observations of businesshours with an average number of business hours in the first quarter (QR1) bigger than 34 and smaller than or equal to 36 and make a summary of this.
# Note the single & (element-wise AND) combining the two vectorized tests.
summary(businesshours[businesshours$QR1 > 34 & businesshours$QR1 <= 36, ])
## QR1 QR2 QR3 QR4
## Min. :35.00 Min. :35.0 Min. :36.0 Min. :35.00
## 1st Qu.:35.00 1st Qu.:35.5 1st Qu.:37.5 1st Qu.:35.50
## Median :35.00 Median :36.0 Median :39.0 Median :36.00
## Mean :35.33 Mean :36.0 Mean :38.0 Mean :35.67
## 3rd Qu.:35.50 3rd Qu.:36.5 3rd Qu.:39.0 3rd Qu.:36.00
## Max. :36.00 Max. :37.0 Max. :39.0 Max. :36.00
## country period
## Min. :1.000 Length:3
## 1st Qu.:1.000 Class :character
## Median :1.000 Mode :character
## Mean :1.333
## 3rd Qu.:1.500
## Max. :2.000
# --- dplyr: the same selections with filter() ---
# Load the appropriate package
library(dplyr)
# Select the observations of businesshours from the period before the crisis ("bc") using the filter() function from the dplyr package.
filter(businesshours, period == "bc")
## QR1 QR2 QR3 QR4 country period
## 1 36 37 39 36 1 bc
## 2 34 35 37 33 2 bc
## 3 37 38 40 37 1 bc
## 4 35 36 39 35 2 bc
# Select the observations of businesshours with an average number of business hours in the first quarter (QR1) bigger than 34 and smaller than or equal to 36 using the filter() function from the dplyr package and make a summary of this.
# Same rows as the base-R version above; note filter() would silently drop
# NA rows where [ would keep them as all-NA rows (no NAs here, so identical).
summary(filter(businesshours, QR1 > 34 & QR1 <= 36))
## QR1 QR2 QR3 QR4
## Min. :35.00 Min. :35.0 Min. :36.0 Min. :35.00
## 1st Qu.:35.00 1st Qu.:35.5 1st Qu.:37.5 1st Qu.:35.50
## Median :35.00 Median :36.0 Median :39.0 Median :36.00
## Mean :35.33 Mean :36.0 Mean :38.0 Mean :35.67
## 3rd Qu.:35.50 3rd Qu.:36.5 3rd Qu.:39.0 3rd Qu.:36.00
## Max. :36.00 Max. :37.0 Max. :39.0 Max. :36.00
## country period
## Min. :1.000 Length:3
## 1st Qu.:1.000 Class :character
## Median :1.000 Mode :character
## Mean :1.333
## 3rd Qu.:1.500
## Max. :2.000
# --- Logic helpers operating on a logical vector ---
# Print a logical vector which indicates which elements of period from businesshours are equal to "bc".
businesshours$period == "bc"
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
# Print the indices of period from businesshours which are equal to "ab".
which(businesshours$period == "ab")
## [1] 5 6 7 8
# Find out whether there are subjects of period from businesshours equal to "bc".
any(businesshours$period == "bc")
## [1] TRUE
# Find out how many subjects of period from businesshours are equal to "bc".
# sum() counts the TRUEs; na.rm=TRUE guards against missing periods.
sum(businesshours$period == "bc", na.rm=TRUE)
## [1] 4
# Find out whether all the subjects of period from businesshours are equal to "bc".
all(businesshours$period == "bc")
## [1] FALSE
Chapter 13 - Selecting Variables and Observations
Selecting variables and observations - use both row and column portions of subsetting:
Can also use dplyr to combine dplyr::select() and dplyr::filter():
Example code includes:
# Create a character vector with the variables: "period", "QR1" and "QR2" and call it `myVars`.
myVars <- c("period", "QR1", "QR2")
# Create a vector with the observations of the period equal to "bc" and call this vector `myObs`.
# which() converts the logical test into integer row positions.
myObs <- which(businesshours$period == "bc")
# Select, with the two vectors from above, the variables and observations from `businesshours` by subscripting.
# Save this selection in 'mySubset', print it and make summary of it.
# Row positions go before the comma, column names after it.
mySubset <- businesshours[myObs, myVars]
mySubset
## period QR1 QR2
## 1 bc 36 37
## 2 bc 34 35
## 3 bc 37 38
## 4 bc 35 36
summary(mySubset)
## period QR1 QR2
## Length:4 Min. :34.00 Min. :35.00
## Class :character 1st Qu.:34.75 1st Qu.:35.75
## Mode :character Median :35.50 Median :36.50
## Mean :35.50 Mean :36.50
## 3rd Qu.:36.25 3rd Qu.:37.25
## Max. :37.00 Max. :38.00
# dplyr equivalent: select() picks the columns, filter() picks the rows.
# Use the select() to create mySubset1 with variables period, QR1 and QR2 (in this order) selected from businesshours.
mySubset1 <- select(businesshours, period, QR1, QR2)
# Use the filter() function to select from mySubset1 the observations with the period equal to "bc". Store the result in mySubset2
mySubset2 <- filter(mySubset1, period == "bc")
# Make a summary of mySubset2
# Same result as the base-R subscripting version above.
summary(mySubset2)
## period QR1 QR2
## Length:4 Min. :34.00 Min. :35.00
## Class :character 1st Qu.:34.75 1st Qu.:35.75
## Mode :character Median :35.50 Median :36.50
## Mean :35.50 Mean :36.50
## 3rd Qu.:36.25 3rd Qu.:37.25
## Max. :37.00 Max. :38.00
Chapter 14 - Transformations
Transformations - making new variables, particularly easy with dplyr::mutate():
Example code includes:
# Take a four-column copy of the quarterly data and rename to neutral A-D.
yourdata <- select(businesshours, QR1, QR2, QR3, QR4)
names(yourdata) <- c("A", "B", "C", "D")
# --- Approach 1: create the new variables with $ ---
# Copy the data frame `yourdata` and assign it to `yourdata2`.
yourdata2 <- yourdata
# Subtract all the observations of the `A` variable from the observations of `D` variable and assign it to `yourdata2$diff`.
yourdata2$diff <- yourdata2$D - yourdata2$A
# Divide all the observations from the `D` variable through the observations of `A` variable and assign it to `yourdata2$ratio`.
yourdata2$ratio <- yourdata2$D / yourdata2$A
# Compute the logarithm of the `D` variable and assign it to `yourdata2$Dlog`.
yourdata2$Dlog <- log(yourdata2$D)
# Calculate the mean of the variables `A`, `B`, `C` and `D` and assign it to `yourdata2$mean`.
yourdata2$mean <- (yourdata2$A + yourdata2$B + yourdata2$C + yourdata2$D) / 4
# --- Approach 2: identical transformations via [ , "name" ] subscripting ---
# Copy the data frame `yourdata` and assign it to `yourdata2`.
yourdata2 <- yourdata
# Subtract all the observations of the `A` variable from the observations of `D` variable and assign it to `yourdata2[,"diff"]`.
yourdata2[,"diff"] <- yourdata2[, "D"] - yourdata2[, "A"]
# Divide all the observations from the `D` variable through the observations of `A` variable and assign it to `yourdata2[,"ratio"]`.
yourdata2[,"ratio"] <- yourdata2[, "D"] / yourdata2[, "A"]
# Compute the logarithm of the `D` variable and assign it to `yourdata2[,"Dlog"]`.
yourdata2[,"Dlog"] <- log(yourdata2[, "D"])
# Calculate the mean of the variables `A`, `B`, `C` and `D` and assign it to `yourdata2[,"mean"]`.
yourdata2[,"mean"] <- (yourdata2[, "A"] + yourdata2[, "B"] + yourdata2[, "C"] + yourdata2[, "D"]) / 4
# --- Approach 3: dplyr::mutate() creates all four new columns in one call ---
yourdata2 <- mutate(yourdata, diff = D - A, ratio = D / A, Dlog = log(D), mean = (A + B + C + D) / 4)
x <- 17
y <- 13 / 3
# Calculate `x` to the power 5.
# NOTE(review): the original used `x ** 5`; `**` is an undocumented synonym
# that the parser translates to `^` and which ?Arithmetic flags as deprecated,
# so the canonical `^` operator is used here (identical result).
x^5
## [1] 1419857
# Calculate the exponential function of `x`
exp(x)
## [1] 24154953
# Round the square root of `y` to 2 digits after the decimal point
round(sqrt(y), 2)
## [1] 2.08
# Calculate the round-off error from the previous instruction
abs(sqrt(y) - round(sqrt(y), 2))
## [1] 0.001665999
Chapter 15 - Graphics
Traditional or base graphics - first graphics package available to R, revolving around the generic plot() function:
Embellishments (1) - customizing base graphics:
Plotting Groups (1) - the plot function’s primary weakness in Bob’s opinion:
Scatter plot with regression - using abline() for the line:
The ggplot2 package (1) - “grammar of graphics” by Leland Wilkinson, as implemented by Hadley Wickham:
The ggplot2 package (2) - can add embellishments in many ways:
Embellishments (2) - adding to the ggplot:
Interactive graphics and graphics resources - like JMP or SAS/INSIGHT and the like:
Example code includes:
workshop = factor(c('R', 'SPSS', 'SPSS', 'SPSS', 'Stata', 'SPSS', 'R', 'R', 'SPSS', 'SPSS', 'SPSS', 'SPSS', 'SAS', 'Stata', 'SAS', 'Stata', 'SAS', 'SAS', 'R', 'R', 'SAS', 'SAS', 'R', 'R', 'R', 'Stata', 'SPSS', 'Stata', 'Stata', 'R', 'SAS', 'SAS', 'SAS', 'SPSS', 'R', 'Stata', 'R', 'SAS', 'Stata', 'Stata', 'SPSS', 'SPSS', 'SAS', 'SPSS', 'SAS', 'SPSS', 'SPSS', 'SAS', 'R', 'Stata', 'R', 'SAS', 'SPSS', 'SPSS', 'R', 'SPSS', 'SAS', 'Stata', 'R', 'Stata', 'Stata', 'R', 'SAS', 'R', 'R', 'SPSS', 'SAS', 'SPSS', 'R', 'SPSS', 'R', 'Stata', 'R', 'Stata', 'R', 'SPSS', 'SAS', 'R', 'SAS', 'SPSS', 'Stata', 'SAS', 'R', 'SPSS', 'R', 'Stata', 'SAS', 'SAS', 'R', 'Stata', 'R', 'Stata', 'R', 'R', 'R', 'SPSS', 'SAS', 'R', 'SAS', 'SPSS'), levels=c("R", "SAS", "SPSS", "Stata"))
gender = factor(c('Female', 'Male', 'Male', 'Female', 'Female', 'Female', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male'), levels=c("Female", "Male"))
# Factor vs. factor: plot() dispatches to a mosaic-style display.
# First with workshop on the x-axis and gender on the y-axis ...
plot(x = workshop, y = gender)
# ... then with the axes swapped: gender on x, workshop on y.
plot(x = gender, y = workshop)
# Numeric scores for the same 100 students, presumably measured before
# (`pretest`) and after (`posttest`) the workshop — TODO confirm against
# the course data description. Visible pretest range is roughly 58-86.
pretest = c(72, 70, 74, 80, 75, 72, 72, 83, 73, 79, 82, 77, 73, 75, 73, 81, 74, 83, 72, 72, 76, 75, 72, 67, 75, 71, 80, 70, 81, 72, 76, 79, 72, 78, 75, 69, 74, 83, 74, 71, 75, 77, 80, 81, 76, 81, 71, 63, 73, 72, 63, 78, 71, 74, 67, 78, 84, 71, 74, 85, 80, 85, 75, 74, 72, 82, 69, 77, 75, 86, 72, 67, 76, 75, 71, 76, 74, 72, 78, 73, 66, 62, 72, 82, 79, 81, 80, 77, 67, 76, 83, 58, 71, 81, 78, 79, 77, 78, 75, 70)
# Posttest scores, paired element-wise with `pretest`.
posttest = c(80, 75, 78, 82, 81, 77, 88, 92, 76, 84, 83, 81, 76, 74, 77, 84, 82, 86, 86, 84, 77, 81, 84, 79, 89, 76, 90, 75, 82, 86, 77, 78, 75, 81, 85, 79, 91, 90, 75, 76, 81, 82, 86, 83, 77, 90, 77, 67, 86, 83, 76, 87, 80, 78, 81, 81, 85, 72, 86, 95, 85, 95, 81, 88, 80, 84, 68, 78, 84, 90, 88, 75, 89, 78, 83, 83, 77, 87, 86, 75, 69, 71, 79, 88, 92, 96, 77, 79, 81, 86, 98, 59, 90, 88, 87, 84, 89, 92, 82, 80)
# plot() picks its display from the argument types:
# factor x with numeric y gives side-by-side box plots ...
plot(x = workshop, y = pretest)
# ... numeric x with factor y gives a strip-style display ...
plot(x = pretest, y = workshop)
# ... and two numeric vectors give an ordinary scatter plot.
plot(x = posttest, y = pretest)
# Histogram of pretest, with a rug of tick marks along the axis.
hist(pretest)
rug(pretest)
# Scatter plot of pretest against posttest with embellishments:
# cross symbols (pch = 3) at half size, plus title and axis labels.
plot(
  x = posttest, y = pretest,
  pch = 3, cex = 0.5,
  main = "Embellished plot", xlab = "X values", ylab = "Y values"
)
grid()
# Scatter plot with a regression line added "manually" — the intercept
# and slope are typed in rather than taken from a fitted model.
plot(x = pretest, y = posttest)
abline(a = 18.78, b = 0.845)
# Plot the pretest variable against the posttest variable
plot(pretest, posttest)
# Collect the four example vectors into one data frame so models (and the
# ggplot2 calls below) can reference a single data source.
mydata100 <- data.frame(workshop=workshop, gender=gender, pretest=pretest, posttest=posttest)
# Fit the regression of posttest on pretest.
myModel <- lm(posttest ~ pretest, data = mydata100)
# Add the fitted line to the open plot from the estimated coefficients.
abline(coefficients(myModel))
# Plot the posttest variable against the pretest variable with the right embellishments
plot(posttest, pretest, pch=3, cex=2, main="Combination Plot", xlab="X: posttest", ylab="Y: pretest")
grid()
# Fit the reverse regression and draw its line. `data = mydata100` is
# added for consistency with the model above; the original fit from the
# global vectors produces the identical result, since the data frame
# columns are copies of those vectors.
myModel <- lm(pretest ~ posttest, data = mydata100)
abline(coefficients(myModel))
# Bar charts of the factors with ggplot2.
library(ggplot2)
# Simple bar chart of workshop attendance.
ggplot(mydata100, aes(x = workshop)) +
  geom_bar()
# Workshop bars subdivided into stacked gender segments.
ggplot(mydata100, aes(x = workshop, fill = gender)) +
  geom_bar(position = "stack")
# Gender bars with stacked workshop segments, rendered in grey scale.
ggplot(mydata100, aes(x = gender, fill = workshop)) +
  geom_bar(position = "stack") +
  scale_fill_grey()
# Workshop bars with side-by-side (dodged) gender bars, in grey scale.
ggplot(mydata100, aes(x = workshop, fill = gender)) +
  geom_bar(position = "dodge") +
  scale_fill_grey()
# Grouped bar chart: one facet row per gender via facet_grid().
ggplot(mydata100, aes(x = workshop)) +
  geom_bar() +
  facet_grid(gender ~ .)
# Box plots of pretest by workshop, one facet column per gender, with the
# raw data points drawn on top of the boxes.
ggplot(mydata100, aes(x = workshop, y = pretest)) +
  geom_boxplot() +
  facet_grid(. ~ gender) +
  geom_point()
# Scatter plot of posttest vs. pretest: point shape and regression line
# type are both keyed to gender; points are enlarged to size 5 and a
# per-gender linear fit is overlaid.
ggplot(mydata100, aes(x = pretest, y = posttest, shape = gender, linetype = gender)) +
  geom_point(size = 5) +
  geom_smooth(method = "lm")
# Plain scatter plot with a title and descriptive axis labels.
ggplot(mydata100, aes(x = pretest, y = posttest)) +
  geom_point() +
  labs(title = "Plot of Test Scores", x = "Before Workshop", y = "After Workshop")
# Custom theme, version 1: start from theme_bw(), chain one theme() call
# per tweak. element_blank() removes each set of grid lines entirely and
# rel(2) doubles the title size.
my_white <- theme_bw() +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.grid.major.y = element_blank()) +
  theme(panel.grid.minor.y = element_blank()) +
  theme(plot.title = element_text(size = rel(2)))
# Apply the custom theme to the labelled scatter plot.
ggplot(mydata100, aes(x = pretest, y = posttest)) +
  geom_point() +
  labs(title = "Plot of Test Scores", x = "Before Workshop", y = "After Workshop") +
  my_white
# Version 2: exactly the same theme, written as a single theme() call
# with all the settings passed together.
my_white <- theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = rel(2))
  )
# Identical plot using the consolidated theme.
ggplot(mydata100, aes(x = pretest, y = posttest)) +
  geom_point() +
  labs(title = "Plot of Test Scores", x = "Before Workshop", y = "After Workshop") +
  my_white
library(RColorBrewer)
# Show every ColorBrewer palette restricted to four colors.
display.brewer.all(n = 4)
# Stacked workshop/gender bar chart colored with the "Set2" palette.
ggplot(mydata100, aes(workshop, fill = gender)) +
  geom_bar(position = "stack") +
  scale_fill_brewer(palette = "Set2")
# Theme as before, but with the title at triple size (rel(3)).
my_white <- theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(size = rel(3))
  )
# Combination plot: faceted scatter (workshop rows, gender columns) with
# per-gender point shapes, linear fits, custom labels, and the theme.
ggplot(mydata100, aes(x = pretest, y = posttest, shape = gender, linetype = gender)) +
  geom_point(size = 2) +
  facet_grid(workshop ~ gender) +
  labs(title = "Combination Plot", x = "Before Workshop", y = "After Workshop") +
  geom_smooth(method = "lm") +
  my_white
Chapter 16 - Writing Functions
Writing Functions - similar to macros in other languages:
Applying Functions by Group; Anonymous Functions:
Debugging Tips - general R programming tips, but especially useful for newly written functions:
Example code includes:
# mymean(): arithmetic mean of a numeric vector with missing values
# dropped. Returns a bare (unnamed) scalar; an empty or all-NA input
# yields NaN, as mean() does.
mymean <- function(x) mean(x, na.rm = TRUE)
# Apply mymean on `pretest`; the result is auto-printed at the top level
mymean(pretest)
## [1] 74.97
# mystats(): summary statistics of a numeric vector with missing values
# removed. Returns a named numeric vector with elements, in order:
# mean, sd, median, max, min.
mystats <- function(x) {
  summaries <- list(mean = mean, sd = sd, median = median, max = max, min = min)
  # vapply keeps the list names, giving the same named result as the
  # original c(mean = ..., sd = ..., ...) construction.
  vapply(summaries, function(f) f(x, na.rm = TRUE), numeric(1))
}
# Apply mystats on pretest; the element names label the printed output
mystats(pretest)
## mean sd median max min
## 74.970000 5.296187 75.000000 86.000000 58.000000
# Per-group statistics: apply mystats() to pretest within each gender.
by(data = pretest, INDICES = gender, FUN = mystats)
## gender: Female
## mean sd median max min
## 74.617021 5.289667 74.000000 86.000000 62.000000
## --------------------------------------------------------
## gender: Male
## mean sd median max min
## 75.283019 5.332691 75.000000 85.000000 58.000000
# An anonymous function works too: unnamed mean and minimum by gender.
by(pretest, gender, function(x) c(mean(x), min(x)))
## gender: Female
## [1] 74.61702 62.00000
## --------------------------------------------------------
## gender: Male
## [1] 75.28302 58.00000
# The "debugged" exercise call, rewritten with every argument in
# canonical name/order form (equivalent to mean(na.rm = TRUE, x),
# sd(x, TRUE), and median(x = x, na.rm = TRUE) in the original).
by(pretest, gender, function(x) {
  c(mean(x, na.rm = TRUE), sd(x, na.rm = TRUE), median(x, na.rm = TRUE))
})
## gender: Female
## [1] 74.617021 5.289667 74.000000
## --------------------------------------------------------
## gender: Male
## [1] 75.283019 5.332691 75.000000
Chapter 17 - Basic Statistics